[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] xen: Use asm inline when available for alternatives



On Tue, Apr 22, 2025 at 12:40 PM Andrew Cooper
<andrew.cooper3@xxxxxxxxxx> wrote:
>
> Compilers estimate the size of an asm() block for inlining purposes.
>
> Constructs such as ALTERNATIVE appear large due to the metadata, depsite often
> only being a handful of instructions.  asm inline() overrides the estimation
> to identify the block as being small.
>
> This has a substantial impact on inlining decisions, expected to be for the
> better given that the compiler has a more accurate picture to work with.
>
> Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> ---
> CC: Jan Beulich <JBeulich@xxxxxxxx>
> CC: Roger Pau Monné <roger.pau@xxxxxxxxxx>
> CC: Stefano Stabellini <sstabellini@xxxxxxxxxx>
> CC: Julien Grall <julien@xxxxxxx>
> CC: Volodymyr Babchuk <Volodymyr_Babchuk@xxxxxxxx>
> CC: Bertrand Marquis <bertrand.marquis@xxxxxxx>
> CC: Michal Orzel <michal.orzel@xxxxxxx>
> ---
>  xen/Kconfig                               |  4 +++
>  xen/arch/arm/include/asm/alternative.h    |  4 +--
>  xen/arch/arm/include/asm/arm64/flushtlb.h |  4 +--
>  xen/arch/arm/include/asm/arm64/io.h       | 43 ++++++++++++++---------
>  xen/arch/arm/include/asm/cpuerrata.h      |  8 ++---
>  xen/arch/arm/include/asm/cpufeature.h     |  8 ++---
>  xen/arch/arm/include/asm/page.h           | 12 ++++---
>  xen/arch/arm/include/asm/processor.h      |  7 ++--
>  xen/arch/arm/include/asm/sysregs.h        | 10 +++---
>  xen/arch/arm/mmu/p2m.c                    |  3 +-
>  xen/arch/x86/include/asm/alternative.h    | 36 ++++++++++---------
>  xen/include/xen/compiler.h                | 14 ++++++++
>  12 files changed, 95 insertions(+), 58 deletions(-)
>
> diff --git a/xen/Kconfig b/xen/Kconfig
> index ae1c401a981e..ab4ab42ae987 100644
> --- a/xen/Kconfig
> +++ b/xen/Kconfig
> @@ -29,6 +29,10 @@ config LD_IS_GNU
>  config LD_IS_LLVM
>         def_bool $(success,$(LD) --version | head -n 1 | grep -q "^LLD")
>
> +config CC_HAS_ASM_INLINE
> +       # GCC >= 9, Clang >= 11
> +       def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) 
> -x c - -c -o /dev/null)
> +

Why not check for "asm __inline" instead of "asm inline" ? The
asm_inline macro below uses "asm __inline".
They are mainly synonyms but it would look more coherent.

>  # Use -f{function,data}-sections compiler parameters
>  config CC_SPLIT_SECTIONS
>         bool
> diff --git a/xen/arch/arm/include/asm/alternative.h 
> b/xen/arch/arm/include/asm/alternative.h
> index 22477d9497a3..1563f03a0f5a 100644
> --- a/xen/arch/arm/include/asm/alternative.h
> +++ b/xen/arch/arm/include/asm/alternative.h
> @@ -209,9 +209,9 @@ alternative_endif
>  #endif  /*  __ASSEMBLY__  */
>
>  /*
> - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature));
> + * Usage: asm_inline (ALTERNATIVE(oldinstr, newinstr, feature));
>   *
> - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
> + * Usage: asm_inline (ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
>   * N.B. If CONFIG_FOO is specified, but not selected, the whole block
>   *      will be omitted, including oldinstr.
>   */
> diff --git a/xen/arch/arm/include/asm/arm64/flushtlb.h 
> b/xen/arch/arm/include/asm/arm64/flushtlb.h
> index 45642201d147..3b99c11b50d1 100644
> --- a/xen/arch/arm/include/asm/arm64/flushtlb.h
> +++ b/xen/arch/arm/include/asm/arm64/flushtlb.h
> @@ -31,7 +31,7 @@
>  #define TLB_HELPER(name, tlbop, sh)              \
>  static inline void name(void)                    \
>  {                                                \
> -    asm volatile(                                \
> +    asm_inline volatile (                        \
>          "dsb  "  # sh  "st;"                     \
>          "tlbi "  # tlbop  ";"                    \
>          ALTERNATIVE(                             \
> @@ -55,7 +55,7 @@ static inline void name(void)                    \
>  #define TLB_HELPER_VA(name, tlbop)               \
>  static inline void name(vaddr_t va)              \
>  {                                                \
> -    asm volatile(                                \
> +    asm_inline volatile (                        \
>          "tlbi "  # tlbop  ", %0;"                \
>          ALTERNATIVE(                             \
>              "nop; nop;",                         \
> diff --git a/xen/arch/arm/include/asm/arm64/io.h 
> b/xen/arch/arm/include/asm/arm64/io.h
> index 7d5959877759..ac90b729c44d 100644
> --- a/xen/arch/arm/include/asm/arm64/io.h
> +++ b/xen/arch/arm/include/asm/arm64/io.h
> @@ -51,40 +51,51 @@ static inline void __raw_writeq(u64 val, volatile void 
> __iomem *addr)
>  static inline u8 __raw_readb(const volatile void __iomem *addr)
>  {
>          u8 val;
> -        asm volatile(ALTERNATIVE("ldrb %w0, [%1]",
> -                                 "ldarb %w0, [%1]",
> -                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
> -                     : "=r" (val) : "r" (addr));
> +
> +        asm_inline volatile (
> +            ALTERNATIVE("ldrb %w0, [%1]",
> +                        "ldarb %w0, [%1]",
> +                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
> +            : "=r" (val) : "r" (addr) );
> +
>          return val;
>  }
>
>  static inline u16 __raw_readw(const volatile void __iomem *addr)
>  {
>          u16 val;
> -        asm volatile(ALTERNATIVE("ldrh %w0, [%1]",
> -                                 "ldarh %w0, [%1]",
> -                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
> -                     : "=r" (val) : "r" (addr));
> +        asm_inline volatile (
> +            ALTERNATIVE("ldrh %w0, [%1]",
> +                        "ldarh %w0, [%1]",
> +                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
> +            : "=r" (val) : "r" (addr) );
> +
>          return val;
>  }
>
>  static inline u32 __raw_readl(const volatile void __iomem *addr)
>  {
>          u32 val;
> -        asm volatile(ALTERNATIVE("ldr %w0, [%1]",
> -                                 "ldar %w0, [%1]",
> -                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
> -                     : "=r" (val) : "r" (addr));
> +
> +        asm_inline volatile (
> +            ALTERNATIVE("ldr %w0, [%1]",
> +                        "ldar %w0, [%1]",
> +                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
> +            : "=r" (val) : "r" (addr) );
> +
>          return val;
>  }
>
>  static inline u64 __raw_readq(const volatile void __iomem *addr)
>  {
>          u64 val;
> -        asm volatile(ALTERNATIVE("ldr %0, [%1]",
> -                                 "ldar %0, [%1]",
> -                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
> -                     : "=r" (val) : "r" (addr));
> +
> +        asm_inline volatile (
> +            ALTERNATIVE("ldr %0, [%1]",
> +                        "ldar %0, [%1]",
> +                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
> +            : "=r" (val) : "r" (addr) );
> +
>          return val;
>  }
>
> diff --git a/xen/arch/arm/include/asm/cpuerrata.h 
> b/xen/arch/arm/include/asm/cpuerrata.h
> index 8d7e7b9375bd..1799a16d7e7f 100644
> --- a/xen/arch/arm/include/asm/cpuerrata.h
> +++ b/xen/arch/arm/include/asm/cpuerrata.h
> @@ -16,10 +16,10 @@ static inline bool check_workaround_##erratum(void)       
>       \
>      {                                                           \
>          register_t ret;                                         \
>                                                                  \
> -        asm volatile (ALTERNATIVE("mov %0, #0",                 \
> -                                  "mov %0, #1",                 \
> -                                  feature)                      \
> -                      : "=r" (ret));                            \
> +        asm_inline volatile (                                   \
> +            ALTERNATIVE("mov %0, #0",                           \
> +                        "mov %0, #1", feature)                  \
> +            : "=r" (ret) );                                     \
>                                                                  \
>          return unlikely(ret);                                   \
>      }                                                           \
> diff --git a/xen/arch/arm/include/asm/cpufeature.h 
> b/xen/arch/arm/include/asm/cpufeature.h
> index 50297e53d90e..b6df18801166 100644
> --- a/xen/arch/arm/include/asm/cpufeature.h
> +++ b/xen/arch/arm/include/asm/cpufeature.h
> @@ -102,10 +102,10 @@ static inline bool cpus_have_cap(unsigned int num)
>  #define cpus_have_const_cap(num) ({                 \
>          register_t __ret;                           \
>                                                      \
> -        asm volatile (ALTERNATIVE("mov %0, #0",     \
> -                                  "mov %0, #1",     \
> -                                  num)              \
> -                      : "=r" (__ret));              \
> +        asm_inline volatile (                       \
> +            ALTERNATIVE("mov %0, #0",               \
> +                        "mov %0, #1", num)          \
> +            : "=r" (__ret) );                       \
>                                                      \
>          unlikely(__ret);                            \
>          })
> diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
> index 69f817d1e68a..27bc96b9f401 100644
> --- a/xen/arch/arm/include/asm/page.h
> +++ b/xen/arch/arm/include/asm/page.h
> @@ -176,7 +176,8 @@ static inline int invalidate_dcache_va_range(const void 
> *p, unsigned long size)
>      {
>          size -= dcache_line_bytes - ((uintptr_t)p & cacheline_mask);
>          p = (void *)((uintptr_t)p & ~cacheline_mask);
> -        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p));
> +        asm_inline volatile (
> +            __clean_and_invalidate_dcache_one(0) :: "r" (p) );
>          p += dcache_line_bytes;
>      }
>
> @@ -185,7 +186,8 @@ static inline int invalidate_dcache_va_range(const void 
> *p, unsigned long size)
>          asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx));
>
>      if ( size > 0 )
> -        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + 
> idx));
> +        asm_inline volatile (
> +            __clean_and_invalidate_dcache_one(0) :: "r" (p + idx) );
>
>      dsb(sy);           /* So we know the flushes happen before continuing */
>
> @@ -209,7 +211,7 @@ static inline int clean_dcache_va_range(const void *p, 
> unsigned long size)
>      p = (void *)((uintptr_t)p & ~cacheline_mask);
>      for ( ; size >= dcache_line_bytes;
>              idx += dcache_line_bytes, size -= dcache_line_bytes )
> -        asm volatile (__clean_dcache_one(0) : : "r" (p + idx));
> +        asm_inline volatile ( __clean_dcache_one(0) : : "r" (p + idx) );
>      dsb(sy);           /* So we know the flushes happen before continuing */
>      /* ARM callers assume that dcache_* functions cannot fail. */
>      return 0;
> @@ -247,7 +249,7 @@ static inline int clean_and_invalidate_dcache_va_range
>      if ( sizeof(x) > MIN_CACHELINE_BYTES || sizeof(x) > alignof(x) )    \
>          clean_dcache_va_range(_p, sizeof(x));                           \
>      else                                                                \
> -        asm volatile (                                                  \
> +        asm_inline volatile (                                           \
>              "dsb sy;"   /* Finish all earlier writes */                 \
>              __clean_dcache_one(0)                                       \
>              "dsb sy;"   /* Finish flush before continuing */            \
> @@ -259,7 +261,7 @@ static inline int clean_and_invalidate_dcache_va_range
>      if ( sizeof(x) > MIN_CACHELINE_BYTES || sizeof(x) > alignof(x) )    \
>          clean_and_invalidate_dcache_va_range(_p, sizeof(x));            \
>      else                                                                \
> -        asm volatile (                                                  \
> +        asm_inline volatile (                                           \
>              "dsb sy;"   /* Finish all earlier writes */                 \
>              __clean_and_invalidate_dcache_one(0)                        \
>              "dsb sy;"   /* Finish flush before continuing */            \
> diff --git a/xen/arch/arm/include/asm/processor.h 
> b/xen/arch/arm/include/asm/processor.h
> index 60b587db697f..9cbc4f911061 100644
> --- a/xen/arch/arm/include/asm/processor.h
> +++ b/xen/arch/arm/include/asm/processor.h
> @@ -607,9 +607,10 @@ register_t get_default_cptr_flags(void);
>  #define SYNCHRONIZE_SERROR(feat)                                  \
>      do {                                                          \
>          ASSERT(local_abort_is_enabled());                         \
> -        asm volatile(ALTERNATIVE("dsb sy; isb",                   \
> -                                 "nop; nop", feat)                \
> -                                 : : : "memory");                 \
> +        asm_inline volatile (                                     \
> +            ALTERNATIVE("dsb sy; isb",                            \
> +                        "nop; nop", feat)                         \
> +            ::: "memory" );                                       \
>      } while (0)
>
>  /*
> diff --git a/xen/arch/arm/include/asm/sysregs.h 
> b/xen/arch/arm/include/asm/sysregs.h
> index 61e30c9e517c..5c2d362be3d8 100644
> --- a/xen/arch/arm/include/asm/sysregs.h
> +++ b/xen/arch/arm/include/asm/sysregs.h
> @@ -22,11 +22,13 @@ static inline register_t read_sysreg_par(void)
>       * DMB SY before and after accessing it, as part of the workaround for 
> the
>       * errata 1508412.
>       */
> -    asm volatile(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
> -                 CONFIG_ARM64_ERRATUM_1508412));
> +    asm_inline volatile (
> +        ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
> +                    CONFIG_ARM64_ERRATUM_1508412) );
>      par_el1 = READ_SYSREG64(PAR_EL1);
> -    asm volatile(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
> -                 CONFIG_ARM64_ERRATUM_1508412));
> +    asm_inline volatile (
> +        ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
> +                    CONFIG_ARM64_ERRATUM_1508412) );
>
>      return par_el1;
>  }
> diff --git a/xen/arch/arm/mmu/p2m.c b/xen/arch/arm/mmu/p2m.c
> index 7642dbc7c55b..d96078f547d5 100644
> --- a/xen/arch/arm/mmu/p2m.c
> +++ b/xen/arch/arm/mmu/p2m.c
> @@ -228,7 +228,8 @@ void p2m_restore_state(struct vcpu *n)
>       * registers associated to EL1/EL0 translations regime have been
>       * synchronized.
>       */
> -    asm volatile(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE));
> +    asm_inline volatile (
> +        ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE) );
>      WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
>
>      last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()];
> diff --git a/xen/arch/x86/include/asm/alternative.h 
> b/xen/arch/x86/include/asm/alternative.h
> index 38472fb58e2d..4c8be51d0e23 100644
> --- a/xen/arch/x86/include/asm/alternative.h
> +++ b/xen/arch/x86/include/asm/alternative.h
> @@ -116,12 +116,13 @@ extern void alternative_branches(void);
>   * without volatile and memory clobber.
>   */
>  #define alternative(oldinstr, newinstr, feature)                        \
> -        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : 
> "memory")
> +    asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature)      \
> +                          ::: "memory" )
>
>  #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
> -       asm volatile (ALTERNATIVE_2(oldinstr, newinstr1, feature1,      \
> -                                   newinstr2, feature2)                \
> -                     : : : "memory")
> +    asm_inline volatile ( ALTERNATIVE_2(oldinstr, newinstr1, feature1,  \
> +                                        newinstr2, feature2)            \
> +                          ::: "memory" )
>
>  /*
>   * Alternative inline assembly with input.
> @@ -133,14 +134,14 @@ extern void alternative_branches(void);
>   * If you use variable sized constraints like "m" or "g" in the
>   * replacement make sure to pad to the worst case length.
>   */
> -#define alternative_input(oldinstr, newinstr, feature, input...)       \
> -       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
> -                     : : input)
> +#define alternative_input(oldinstr, newinstr, feature, input...)        \
> +    asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature)      \
> +                          :: input )
>
>  /* Like alternative_input, but with a single output argument */
> -#define alternative_io(oldinstr, newinstr, feature, output, input...)  \
> -       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
> -                     : output : input)
> +#define alternative_io(oldinstr, newinstr, feature, output, input...)   \
> +    asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature)      \
> +                          : output : input )
>
>  /*
>   * This is similar to alternative_io. But it has two features and
> @@ -150,11 +151,11 @@ extern void alternative_branches(void);
>   * Otherwise, if CPU has feature1, newinstr1 is used.
>   * Otherwise, oldinstr is used.
>   */
> -#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2,     \
> -                        feature2, output, input...)                    \
> -       asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1,       \
> -                                  newinstr2, feature2)                 \
> -                    : output : input)
> +#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2,      \
> +                         feature2, output, input...)                    \
> +    asm_inline volatile ( ALTERNATIVE_2(oldinstr, newinstr1, feature1,  \
> +                                        newinstr2, feature2)            \
> +                          : output : input )
>
>  /* Use this macro(s) if you need more than one output parameter. */
>  #define ASM_OUTPUT2(a...) a
> @@ -234,8 +235,9 @@ extern void alternative_branches(void);
>      rettype ret_;                                                  \
>      register unsigned long r10_ asm("r10");                        \
>      register unsigned long r11_ asm("r11");                        \
> -    asm volatile (ALTERNATIVE("call *%c[addr](%%rip)", "call .",   \
> -                              X86_FEATURE_ALWAYS)                  \
> +    asm_inline volatile (                                          \
> +        ALTERNATIVE("call *%c[addr](%%rip)", "call .",             \
> +                    X86_FEATURE_ALWAYS)                            \
>                    : ALT_CALL ## n ## _OUT, "=a" (ret_),            \
>                      "=r" (r10_), "=r" (r11_) ASM_CALL_CONSTRAINT   \
>                    : [addr] "i" (&(func)), "g" (func)               \
> diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h
> index c68fab189154..d9b133016bb6 100644
> --- a/xen/include/xen/compiler.h
> +++ b/xen/include/xen/compiler.h
> @@ -53,6 +53,20 @@
>  #define unreachable() __builtin_unreachable()
>  #endif
>
> +/*
> + * Compilers estimate the size of an asm() block for inlining purposes.
> + * Constructs such as ALTERNATIVE appear large due to the metadata, depsite
> + * often only being a handful of instructions.  asm inline() overrides the
> + * estimation to identify the block as being small.
> + *
> + * Note: __inline is needed to avoid getting caught up in INIT_SECTIONS_ONLY.
> + */
> +#if CONFIG_CC_HAS_ASM_INLINE
> +# define asm_inline asm __inline

Here ^^^

> +#else
> +# define asm_inline asm
> +#endif
> +
>  /*
>   * Add the pseudo keyword 'fallthrough' so case statement blocks
>   * must end with any of these keywords:

Frediano



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.