[PATCH] xen: Use asm inline when available for alternatives

Compilers estimate the size of an asm() block for inlining purposes.

Constructs such as ALTERNATIVE appear large due to the metadata, despite often
only being a handful of instructions.  asm inline() overrides the estimation
to identify the block as being small.

This has a substantial impact on inlining decisions, expected to be for the
better given that the compiler has a more accurate picture to work with.
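
For illustration only (not part of the patch), a minimal sketch of the two
spellings, assuming a toolchain that accepts the keyword (GCC >= 9,
Clang >= 11); the example_fence() function and its "nop" payload are
hypothetical stand-ins for an ALTERNATIVE-style block:

    static inline void example_fence(void)
    {
    #ifdef CONFIG_CC_HAS_ASM_INLINE
        /* Costed as minimal for inlining, however long the asm text is. */
        asm inline volatile ( "nop" ::: "memory" );
    #else
        /* Costed from the full asm text, metadata included. */
        asm volatile ( "nop" ::: "memory" );
    #endif
    }

With asm inline(), a caller containing such a block remains a candidate for
inlining even when ALTERNATIVE's embedded metadata makes the raw text appear
large.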

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
CC: Jan Beulich <JBeulich@xxxxxxxx>
CC: Roger Pau Monné <roger.pau@xxxxxxxxxx>
CC: Stefano Stabellini <sstabellini@xxxxxxxxxx>
CC: Julien Grall <julien@xxxxxxx>
CC: Volodymyr Babchuk <Volodymyr_Babchuk@xxxxxxxx>
CC: Bertrand Marquis <bertrand.marquis@xxxxxxx>
CC: Michal Orzel <michal.orzel@xxxxxxx>
---
 xen/Kconfig                               |  4 +++
 xen/arch/arm/include/asm/alternative.h    |  4 +--
 xen/arch/arm/include/asm/arm64/flushtlb.h |  4 +--
 xen/arch/arm/include/asm/arm64/io.h       | 43 ++++++++++++++---------
 xen/arch/arm/include/asm/cpuerrata.h      |  8 ++---
 xen/arch/arm/include/asm/cpufeature.h     |  8 ++---
 xen/arch/arm/include/asm/page.h           | 12 ++++---
 xen/arch/arm/include/asm/processor.h      |  7 ++--
 xen/arch/arm/include/asm/sysregs.h        | 10 +++---
 xen/arch/arm/mmu/p2m.c                    |  3 +-
 xen/arch/x86/include/asm/alternative.h    | 36 ++++++++++---------
 xen/include/xen/compiler.h                | 14 ++++++++
 12 files changed, 95 insertions(+), 58 deletions(-)

diff --git a/xen/Kconfig b/xen/Kconfig
index ae1c401a981e..ab4ab42ae987 100644
--- a/xen/Kconfig
+++ b/xen/Kconfig
@@ -29,6 +29,10 @@ config LD_IS_GNU
 config LD_IS_LLVM
        def_bool $(success,$(LD) --version | head -n 1 | grep -q "^LLD")
 
+config CC_HAS_ASM_INLINE
+       # GCC >= 9, Clang >= 11
+       def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null)
+
 # Use -f{function,data}-sections compiler parameters
 config CC_SPLIT_SECTIONS
        bool
diff --git a/xen/arch/arm/include/asm/alternative.h b/xen/arch/arm/include/asm/alternative.h
index 22477d9497a3..1563f03a0f5a 100644
--- a/xen/arch/arm/include/asm/alternative.h
+++ b/xen/arch/arm/include/asm/alternative.h
@@ -209,9 +209,9 @@ alternative_endif
 #endif  /*  __ASSEMBLY__  */
 
 /*
- * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature));
+ * Usage: asm_inline (ALTERNATIVE(oldinstr, newinstr, feature));
  *
- * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
+ * Usage: asm_inline (ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
  * N.B. If CONFIG_FOO is specified, but not selected, the whole block
  *      will be omitted, including oldinstr.
  */
diff --git a/xen/arch/arm/include/asm/arm64/flushtlb.h b/xen/arch/arm/include/asm/arm64/flushtlb.h
index 45642201d147..3b99c11b50d1 100644
--- a/xen/arch/arm/include/asm/arm64/flushtlb.h
+++ b/xen/arch/arm/include/asm/arm64/flushtlb.h
@@ -31,7 +31,7 @@
 #define TLB_HELPER(name, tlbop, sh)              \
 static inline void name(void)                    \
 {                                                \
-    asm volatile(                                \
+    asm_inline volatile (                        \
         "dsb  "  # sh  "st;"                     \
         "tlbi "  # tlbop  ";"                    \
         ALTERNATIVE(                             \
@@ -55,7 +55,7 @@ static inline void name(void)                    \
 #define TLB_HELPER_VA(name, tlbop)               \
 static inline void name(vaddr_t va)              \
 {                                                \
-    asm volatile(                                \
+    asm_inline volatile (                        \
         "tlbi "  # tlbop  ", %0;"                \
         ALTERNATIVE(                             \
             "nop; nop;",                         \
diff --git a/xen/arch/arm/include/asm/arm64/io.h b/xen/arch/arm/include/asm/arm64/io.h
index 7d5959877759..ac90b729c44d 100644
--- a/xen/arch/arm/include/asm/arm64/io.h
+++ b/xen/arch/arm/include/asm/arm64/io.h
@@ -51,40 +51,51 @@ static inline void __raw_writeq(u64 val, volatile void __iomem *addr)
 static inline u8 __raw_readb(const volatile void __iomem *addr)
 {
         u8 val;
-        asm volatile(ALTERNATIVE("ldrb %w0, [%1]",
-                                 "ldarb %w0, [%1]",
-                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
-                     : "=r" (val) : "r" (addr));
+
+        asm_inline volatile (
+            ALTERNATIVE("ldrb %w0, [%1]",
+                        "ldarb %w0, [%1]",
+                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
+            : "=r" (val) : "r" (addr) );
+
         return val;
 }
 
 static inline u16 __raw_readw(const volatile void __iomem *addr)
 {
         u16 val;
-        asm volatile(ALTERNATIVE("ldrh %w0, [%1]",
-                                 "ldarh %w0, [%1]",
-                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
-                     : "=r" (val) : "r" (addr));
+        asm_inline volatile (
+            ALTERNATIVE("ldrh %w0, [%1]",
+                        "ldarh %w0, [%1]",
+                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
+            : "=r" (val) : "r" (addr) );
+
         return val;
 }
 
 static inline u32 __raw_readl(const volatile void __iomem *addr)
 {
         u32 val;
-        asm volatile(ALTERNATIVE("ldr %w0, [%1]",
-                                 "ldar %w0, [%1]",
-                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
-                     : "=r" (val) : "r" (addr));
+
+        asm_inline volatile (
+            ALTERNATIVE("ldr %w0, [%1]",
+                        "ldar %w0, [%1]",
+                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
+            : "=r" (val) : "r" (addr) );
+
         return val;
 }
 
 static inline u64 __raw_readq(const volatile void __iomem *addr)
 {
         u64 val;
-        asm volatile(ALTERNATIVE("ldr %0, [%1]",
-                                 "ldar %0, [%1]",
-                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
-                     : "=r" (val) : "r" (addr));
+
+        asm_inline volatile (
+            ALTERNATIVE("ldr %0, [%1]",
+                        "ldar %0, [%1]",
+                        ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
+            : "=r" (val) : "r" (addr) );
+
         return val;
 }
 
diff --git a/xen/arch/arm/include/asm/cpuerrata.h b/xen/arch/arm/include/asm/cpuerrata.h
index 8d7e7b9375bd..1799a16d7e7f 100644
--- a/xen/arch/arm/include/asm/cpuerrata.h
+++ b/xen/arch/arm/include/asm/cpuerrata.h
@@ -16,10 +16,10 @@ static inline bool check_workaround_##erratum(void)    \
     {                                                           \
         register_t ret;                                         \
                                                                 \
-        asm volatile (ALTERNATIVE("mov %0, #0",                 \
-                                  "mov %0, #1",                 \
-                                  feature)                      \
-                      : "=r" (ret));                            \
+        asm_inline volatile (                                   \
+            ALTERNATIVE("mov %0, #0",                           \
+                        "mov %0, #1", feature)                  \
+            : "=r" (ret) );                                     \
                                                                 \
         return unlikely(ret);                                   \
     }                                                           \
diff --git a/xen/arch/arm/include/asm/cpufeature.h b/xen/arch/arm/include/asm/cpufeature.h
index 50297e53d90e..b6df18801166 100644
--- a/xen/arch/arm/include/asm/cpufeature.h
+++ b/xen/arch/arm/include/asm/cpufeature.h
@@ -102,10 +102,10 @@ static inline bool cpus_have_cap(unsigned int num)
 #define cpus_have_const_cap(num) ({                 \
         register_t __ret;                           \
                                                     \
-        asm volatile (ALTERNATIVE("mov %0, #0",     \
-                                  "mov %0, #1",     \
-                                  num)              \
-                      : "=r" (__ret));              \
+        asm_inline volatile (                       \
+            ALTERNATIVE("mov %0, #0",               \
+                        "mov %0, #1", num)          \
+            : "=r" (__ret) );                       \
                                                     \
         unlikely(__ret);                            \
         })
diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
index 69f817d1e68a..27bc96b9f401 100644
--- a/xen/arch/arm/include/asm/page.h
+++ b/xen/arch/arm/include/asm/page.h
@@ -176,7 +176,8 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
     {
         size -= dcache_line_bytes - ((uintptr_t)p & cacheline_mask);
         p = (void *)((uintptr_t)p & ~cacheline_mask);
-        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p));
+        asm_inline volatile (
+            __clean_and_invalidate_dcache_one(0) :: "r" (p) );
         p += dcache_line_bytes;
     }
 
@@ -185,7 +186,8 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
         asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx));
 
     if ( size > 0 )
-        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx));
+        asm_inline volatile (
+            __clean_and_invalidate_dcache_one(0) :: "r" (p + idx) );
 
     dsb(sy);           /* So we know the flushes happen before continuing */
 
@@ -209,7 +211,7 @@ static inline int clean_dcache_va_range(const void *p, unsigned long size)
     p = (void *)((uintptr_t)p & ~cacheline_mask);
     for ( ; size >= dcache_line_bytes;
             idx += dcache_line_bytes, size -= dcache_line_bytes )
-        asm volatile (__clean_dcache_one(0) : : "r" (p + idx));
+        asm_inline volatile ( __clean_dcache_one(0) : : "r" (p + idx) );
     dsb(sy);           /* So we know the flushes happen before continuing */
     /* ARM callers assume that dcache_* functions cannot fail. */
     return 0;
@@ -247,7 +249,7 @@ static inline int clean_and_invalidate_dcache_va_range
     if ( sizeof(x) > MIN_CACHELINE_BYTES || sizeof(x) > alignof(x) )    \
         clean_dcache_va_range(_p, sizeof(x));                           \
     else                                                                \
-        asm volatile (                                                  \
+        asm_inline volatile (                                           \
             "dsb sy;"   /* Finish all earlier writes */                 \
             __clean_dcache_one(0)                                       \
             "dsb sy;"   /* Finish flush before continuing */            \
@@ -259,7 +261,7 @@ static inline int clean_and_invalidate_dcache_va_range
     if ( sizeof(x) > MIN_CACHELINE_BYTES || sizeof(x) > alignof(x) )    \
         clean_and_invalidate_dcache_va_range(_p, sizeof(x));            \
     else                                                                \
-        asm volatile (                                                  \
+        asm_inline volatile (                                           \
             "dsb sy;"   /* Finish all earlier writes */                 \
             __clean_and_invalidate_dcache_one(0)                        \
             "dsb sy;"   /* Finish flush before continuing */            \
diff --git a/xen/arch/arm/include/asm/processor.h b/xen/arch/arm/include/asm/processor.h
index 60b587db697f..9cbc4f911061 100644
--- a/xen/arch/arm/include/asm/processor.h
+++ b/xen/arch/arm/include/asm/processor.h
@@ -607,9 +607,10 @@ register_t get_default_cptr_flags(void);
 #define SYNCHRONIZE_SERROR(feat)                                  \
     do {                                                          \
         ASSERT(local_abort_is_enabled());                         \
-        asm volatile(ALTERNATIVE("dsb sy; isb",                   \
-                                 "nop; nop", feat)                \
-                                 : : : "memory");                 \
+        asm_inline volatile (                                     \
+            ALTERNATIVE("dsb sy; isb",                            \
+                        "nop; nop", feat)                         \
+            ::: "memory" );                                       \
     } while (0)
 
 /*
diff --git a/xen/arch/arm/include/asm/sysregs.h b/xen/arch/arm/include/asm/sysregs.h
index 61e30c9e517c..5c2d362be3d8 100644
--- a/xen/arch/arm/include/asm/sysregs.h
+++ b/xen/arch/arm/include/asm/sysregs.h
@@ -22,11 +22,13 @@ static inline register_t read_sysreg_par(void)
      * DMB SY before and after accessing it, as part of the workaround for the
      * errata 1508412.
      */
-    asm volatile(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
-                 CONFIG_ARM64_ERRATUM_1508412));
+    asm_inline volatile (
+        ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
+                    CONFIG_ARM64_ERRATUM_1508412) );
     par_el1 = READ_SYSREG64(PAR_EL1);
-    asm volatile(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
-                 CONFIG_ARM64_ERRATUM_1508412));
+    asm_inline volatile (
+        ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412,
+                    CONFIG_ARM64_ERRATUM_1508412) );
 
     return par_el1;
 }
diff --git a/xen/arch/arm/mmu/p2m.c b/xen/arch/arm/mmu/p2m.c
index 7642dbc7c55b..d96078f547d5 100644
--- a/xen/arch/arm/mmu/p2m.c
+++ b/xen/arch/arm/mmu/p2m.c
@@ -228,7 +228,8 @@ void p2m_restore_state(struct vcpu *n)
      * registers associated to EL1/EL0 translations regime have been
      * synchronized.
      */
-    asm volatile(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE));
+    asm_inline volatile (
+        ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE) );
     WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
 
     last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()];
diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
index 38472fb58e2d..4c8be51d0e23 100644
--- a/xen/arch/x86/include/asm/alternative.h
+++ b/xen/arch/x86/include/asm/alternative.h
@@ -116,12 +116,13 @@ extern void alternative_branches(void);
  * without volatile and memory clobber.
  */
 #define alternative(oldinstr, newinstr, feature)                        \
-        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+    asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature)      \
+                          ::: "memory" )
 
 #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
-       asm volatile (ALTERNATIVE_2(oldinstr, newinstr1, feature1,      \
-                                   newinstr2, feature2)                \
-                     : : : "memory")
+    asm_inline volatile ( ALTERNATIVE_2(oldinstr, newinstr1, feature1,  \
+                                        newinstr2, feature2)            \
+                          ::: "memory" )
 
 /*
  * Alternative inline assembly with input.
@@ -133,14 +134,14 @@ extern void alternative_branches(void);
  * If you use variable sized constraints like "m" or "g" in the
  * replacement make sure to pad to the worst case length.
  */
-#define alternative_input(oldinstr, newinstr, feature, input...)       \
-       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
-                     : : input)
+#define alternative_input(oldinstr, newinstr, feature, input...)        \
+    asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature)      \
+                          :: input )
 
 /* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, newinstr, feature, output, input...)  \
-       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
-                     : output : input)
+#define alternative_io(oldinstr, newinstr, feature, output, input...)   \
+    asm_inline volatile ( ALTERNATIVE(oldinstr, newinstr, feature)      \
+                          : output : input )
 
 /*
  * This is similar to alternative_io. But it has two features and
@@ -150,11 +151,11 @@ extern void alternative_branches(void);
  * Otherwise, if CPU has feature1, newinstr1 is used.
  * Otherwise, oldinstr is used.
  */
-#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2,     \
-                        feature2, output, input...)                    \
-       asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1,       \
-                                  newinstr2, feature2)                 \
-                    : output : input)
+#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2,      \
+                         feature2, output, input...)                    \
+    asm_inline volatile ( ALTERNATIVE_2(oldinstr, newinstr1, feature1,  \
+                                        newinstr2, feature2)            \
+                          : output : input )
 
 /* Use this macro(s) if you need more than one output parameter. */
 #define ASM_OUTPUT2(a...) a
@@ -234,8 +235,9 @@ extern void alternative_branches(void);
     rettype ret_;                                                  \
     register unsigned long r10_ asm("r10");                        \
     register unsigned long r11_ asm("r11");                        \
-    asm volatile (ALTERNATIVE("call *%c[addr](%%rip)", "call .",   \
-                              X86_FEATURE_ALWAYS)                  \
+    asm_inline volatile (                                          \
+        ALTERNATIVE("call *%c[addr](%%rip)", "call .",             \
+                    X86_FEATURE_ALWAYS)                            \
                   : ALT_CALL ## n ## _OUT, "=a" (ret_),            \
                     "=r" (r10_), "=r" (r11_) ASM_CALL_CONSTRAINT   \
                   : [addr] "i" (&(func)), "g" (func)               \
diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h
index c68fab189154..d9b133016bb6 100644
--- a/xen/include/xen/compiler.h
+++ b/xen/include/xen/compiler.h
@@ -53,6 +53,20 @@
 #define unreachable() __builtin_unreachable()
 #endif
 
+/*
+ * Compilers estimate the size of an asm() block for inlining purposes.
+ * Constructs such as ALTERNATIVE appear large due to the metadata, despite
+ * often only being a handful of instructions.  asm inline() overrides the
+ * estimation to identify the block as being small.
+ *
+ * Note: __inline is needed to avoid getting caught up in INIT_SECTIONS_ONLY.
+ */
+#ifdef CONFIG_CC_HAS_ASM_INLINE
+# define asm_inline asm __inline
+#else
+# define asm_inline asm
+#endif
+
 /*
  * Add the pseudo keyword 'fallthrough' so case statement blocks
  * must end with any of these keywords:
-- 
2.39.5