
[xen staging] x86/bitops: Optimise arch_ffs{,l}() some more on AMD



commit 437f07b2b52b32929c74c2e19a52437ce7e5b88f
Author:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
AuthorDate: Wed Apr 30 14:35:39 2025 +0100
Commit:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CommitDate: Mon Sep 1 19:54:21 2025 +0100

    x86/bitops: Optimise arch_ffs{,l}() some more on AMD
    
    One aspect of the old ffs() infrastructure was the use of TZCNT without a
    CPUID check.  Given no obvious justification, I dropped it during the
    cleanup.
    
    It turns out there is a good reason, on all but the most recent AMD CPUs.
    
    Reinstate the use of "blind" TZCNT when safe to do so.  This happens to be
    preferentially in loops where a repeated saving of 5-6 uops becomes far more
    relevant than a one-off scenario.  Leave an explanation of why.
    
    It is not safe to perform this transformation for BSR and LZCNT, because they
    count from opposite ends of the input operand.  Furthermore, this incorrect
    transform was not caught by the selftests.
    
    Extend the selftests with test_for_each_set_bit_reverse() as that exercises
    the "x known non-zero" optimisation path in x86's helpers.  Adjust the
    existing test_for_each_set_bit() to intentionally use an asymmetric input bit
    pattern, otherwise "counting from the wrong end" bugs continue to slip by.
    
    No functional change.
    
    Fixes: 989e5f08d33e ("x86/bitops: Improve arch_ffs() in the general case")
    Fixes: 5ed26fc0768d ("xen/bitops: Implement ffsl() in common logic")
    Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
    Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx>
---
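For reference, the "blind" TZCNT is safe when x is known non-zero because TZCNT
is encoded as REP BSF (F3 0F BC): CPUs without BMI1 ignore the REP prefix and
execute plain BSF, and for a non-zero input BSF and TZCNT return the same bit
index.  The same trick cannot work for BSR/LZCNT, since LZCNT counts leading
zeros while BSR returns the index of the highest set bit; for a non-zero 32-bit
value, LZCNT(x) == 31 - BSR(x).  A minimal host-side sketch of that
relationship, using the GCC/Clang builtins in place of the raw instructions
(illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint32_t tests[] = { 1, 0x8, 0x1008, 0x80008001u, 0x80000000u };

        for ( unsigned int i = 0; i < sizeof(tests) / sizeof(tests[0]); i++ )
        {
            uint32_t x = tests[i];

            /* For non-zero x, BSF and TZCNT both yield the lowest set bit's index. */
            unsigned int tzcnt = __builtin_ctz(x);

            /* BSR yields the index of the highest set bit... */
            unsigned int bsr = 31 - __builtin_clz(x);

            /* ...while LZCNT yields the number of leading zero bits. */
            unsigned int lzcnt = __builtin_clz(x);

            printf("x=%#010x  tzcnt/bsf=%2u  bsr=%2u  lzcnt=%2u\n",
                   x, tzcnt, bsr, lzcnt);

            /* The two count from opposite ends of the operand. */
            assert(lzcnt == 31 - bsr);
        }

        return 0;
    }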
 xen/arch/x86/include/asm/bitops.h | 13 +++++--
 xen/common/bitops.c               | 78 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h
index 87eac7782f..b17d67e8ca 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -399,9 +399,16 @@ static always_inline unsigned int arch_ffs(unsigned int x)
          *
          * and the optimiser really can work with the knowledge of x being
          * non-zero without knowing it's exact value, in which case we don't
-         * need to compensate for BSF's corner cases.  Otherwise...
+         * need to compensate for BSF's corner cases.
+         *
+         * That said, we intentionally use TZCNT on capable hardware when the
+         * behaviour for the 0 case doesn't matter.  On AMD CPUs prior to
+         * Zen4, TZCNT is 1-2 uops while BSF is 6-8 with a latency to match.
+         * Intel CPUs don't suffer this discrepancy.
+         *
+         * Otherwise...
          */
-        asm ( "bsf %[val], %[res]"
+        asm ( "tzcnt %[val], %[res]"
               : [res] "=r" (r)
               : [val] "rm" (x) );
     }
@@ -428,7 +435,7 @@ static always_inline unsigned int arch_ffsl(unsigned long x)
 
     /* See arch_ffs() for safety discussions. */
     if ( __builtin_constant_p(x > 0) && x > 0 )
-        asm ( "bsf %[val], %q[res]"
+        asm ( "tzcnt %[val], %q[res]"
               : [res] "=r" (r)
               : [val] "rm" (x) );
     else
diff --git a/xen/common/bitops.c b/xen/common/bitops.c
index bd17ddef57..14e6265037 100644
--- a/xen/common/bitops.c
+++ b/xen/common/bitops.c
@@ -97,14 +97,14 @@ static void __init test_for_each_set_bit(void)
     if ( ui != ui_res )
         panic("for_each_set_bit(uint) expected %#x, got %#x\n", ui, ui_res);
 
-    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 1);
+    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
     for_each_set_bit ( i, ul )
         ul_res |= 1UL << i;
 
     if ( ul != ul_res )
         panic("for_each_set_bit(ulong) expected %#lx, got %#lx\n", ul, ul_res);
 
-    ull = HIDE(0x8000000180000001ULL);
+    ull = HIDE(0x8000000180000011ULL);
     for_each_set_bit ( i, ull )
         ull_res |= 1ULL << i;
 
@@ -127,6 +127,79 @@ static void __init test_for_each_set_bit(void)
         panic("for_each_set_bit(break) expected 0x1008, got %#x\n", ui_res);
 }
 
+/*
+ * A type-generic fls() which picks the appropriate fls{,l,64}() based on its
+ * argument.
+ */
+#define fls_g(x)                                        \
+    (sizeof(x) <= sizeof(int)      ? fls(x) :           \
+     sizeof(x) <= sizeof(long)     ? flsl(x) :          \
+     sizeof(x) <= sizeof(uint64_t) ? fls64(x) :         \
+     ({ BUILD_ERROR("fls_g() Bad input type"); 0; }))
+
+/*
+ * for_each_set_bit_reverse() - Iterate over all set bits in a scalar value,
+ * from MSB to LSB.
+ *
+ * @iter An iterator name.  Scoped to within the loop only.
+ * @val  A scalar value to iterate over.
+ *
+ * A copy of @val is taken internally.
+ */
+#define for_each_set_bit_reverse(iter, val)             \
+    for ( typeof(val) __v = (val); __v; __v = 0 )       \
+        for ( unsigned int (iter);                      \
+              __v && ((iter) = fls_g(__v) - 1, true);   \
+              __clear_bit(iter, &__v) )
+
+/*
+ * Xen doesn't have need of for_each_set_bit_reverse() at present, but the
+ * construct does exercise a case of arch_fls*() not covered anywhere else by
+ * these tests.
+ */
+static void __init test_for_each_set_bit_reverse(void)
+{
+    unsigned int  ui,  ui_res = 0, tmp;
+    unsigned long ul,  ul_res = 0;
+    uint64_t      ull, ull_res = 0;
+
+    ui = HIDE(0x80008001U);
+    for_each_set_bit_reverse ( i, ui )
+        ui_res |= 1U << i;
+
+    if ( ui != ui_res )
+        panic("for_each_set_bit_reverse(uint) expected %#x, got %#x\n", ui, 
ui_res);
+
+    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
+    for_each_set_bit_reverse ( i, ul )
+        ul_res |= 1UL << i;
+
+    if ( ul != ul_res )
+        panic("for_each_set_bit_reverse(ulong) expected %#lx, got %#lx\n", ul, 
ul_res);
+
+    ull = HIDE(0x8000000180000011ULL);
+    for_each_set_bit_reverse ( i, ull )
+        ull_res |= 1ULL << i;
+
+    if ( ull != ull_res )
+        panic("for_each_set_bit_reverse(uint64) expected %#"PRIx64", got 
%#"PRIx64"\n", ull, ull_res);
+
+    /* Check that we can break from the middle of the loop. */
+    ui = HIDE(0x80001008U);
+    tmp = 0;
+    ui_res = 0;
+    for_each_set_bit_reverse ( i, ui )
+    {
+        if ( tmp++ > 1 )
+            break;
+
+        ui_res |= 1U << i;
+    }
+
+    if ( ui_res != 0x80001000U )
+        panic("for_each_set_bit_reverse(break) expected 0x80001000, got 
%#x\n", ui_res);
+}
+
 static void __init test_multiple_bits_set(void)
 {
     /*
@@ -169,6 +242,7 @@ static void __init __constructor test_bitops(void)
     test_ffs();
     test_fls();
     test_for_each_set_bit();
+    test_for_each_set_bit_reverse();
 
     test_multiple_bits_set();
     test_hweight();
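
The for_each_set_bit_reverse() construct added above nests two for loops: the
outer one runs once and scopes a mutable copy of the value, while the inner one
repeatedly takes the highest set bit via fls_g() (1-based, hence the "- 1") and
clears it with __clear_bit().  A minimal stand-alone sketch of the same
expansion for an unsigned int, with 31 - __builtin_clz() standing in for
fls_g(v) - 1 and a plain mask standing in for __clear_bit() (illustrative only,
not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned int val = 0x80001008u;

        /* Outer loop runs exactly once; it scopes a mutable copy of val. */
        for ( unsigned int __v = val; __v; __v = 0 )
            /* Inner loop yields set-bit indices from MSB to LSB. */
            for ( unsigned int i;
                  __v && (i = 31 - __builtin_clz(__v), 1);
                  __v &= ~(1u << i) )
                printf("bit %u\n", i);   /* prints 31, then 12, then 3 */

        return 0;
    }

With 0x80001008 this visits bits 31, 12 and 3 in that order, which is consistent
with the break-in-the-middle test above expecting 0x80001000 after two
iterations.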
--
generated by git-patchbot for /home/xen/git/xen.git#staging