[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v2 11/11] x86/bitops: Use the POPCNT instruction when available



It has existed in x86 CPUs since 2008, so we're only 16 years late adding
support.  With all the other scafolding in place, implement arch_hweightl()
for x86.

The only complication is that the call to arch_generic_hweightl() is behind
the compilers back.  Address this by writing it in ASM and ensure that it
preserves all registers.

Copy the code generation from generic_hweightl().  It's not a complicated
algorithm, and is easy to regenerate if needs be, but cover it with the same
unit tests as test_generic_hweightl() just for piece of mind.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
CC: Jan Beulich <JBeulich@xxxxxxxx>
CC: Roger Pau Monné <roger.pau@xxxxxxxxxx>

v2:
 * Fix MISRA 8.2 (parameter name) and 8.5 (single declaration) regressions.
 * Rename {arch->x86}-generic-hweightl.{S->c}
 * Adjust ASM formating

The __constructor trick to cause any reference to $foo() to pull in
test_$foo() only works when both are in the same TU.  i.e. what I did in
v1 (putting test_arch_generic_hweightl() in the regular generic-hweightl.c)
didn't work.

This in turn means that arch_generic_hweightl() needs writing in a global asm
block, and also that we can't use FUNC()/END().  While we could adjust it to
work for GCC/binutils, we can't have CPP macros in Clang-IAS strings.

I don't particularly like opencoding FUNC()/END(), but I can't think of
anything better.
---
 xen/arch/x86/include/asm/bitops.h | 23 +++++++++++
 xen/lib/Makefile                  |  1 +
 xen/lib/x86-generic-hweightl.c    | 69 +++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 xen/lib/x86-generic-hweightl.c

diff --git a/xen/arch/x86/include/asm/bitops.h 
b/xen/arch/x86/include/asm/bitops.h
index 642d8e58b288..39e37f1cbe55 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -6,6 +6,7 @@
  */
 
 #include <asm/alternative.h>
+#include <asm/asm_defns.h>
 #include <asm/cpufeatureset.h>
 
 /*
@@ -475,4 +476,26 @@ static always_inline unsigned int arch_flsl(unsigned long 
x)
 }
 #define arch_flsl arch_flsl
 
+unsigned int arch_generic_hweightl(unsigned long x);
+
+static always_inline unsigned int arch_hweightl(unsigned long x)
+{
+    unsigned int r;
+
+    /*
+     * arch_generic_hweightl() is written in ASM in order to preserve all
+     * registers, as the compiler can't see the call.
+     *
+     * This limits the POPCNT instruction to using the same ABI as a function
+     * call (input in %rdi, output in %eax) but that's fine.
+     */
+    alternative_io("call arch_generic_hweightl",
+                   "popcnt %[val], %q[res]", X86_FEATURE_POPCNT,
+                   ASM_OUTPUT2([res] "=a" (r) ASM_CALL_CONSTRAINT),
+                   [val] "D" (x));
+
+    return r;
+}
+#define arch_hweightl arch_hweightl
+
 #endif /* _X86_BITOPS_H */
diff --git a/xen/lib/Makefile b/xen/lib/Makefile
index b6558e108bd9..54440f628aae 100644
--- a/xen/lib/Makefile
+++ b/xen/lib/Makefile
@@ -36,6 +36,7 @@ lib-y += strtol.o
 lib-y += strtoll.o
 lib-y += strtoul.o
 lib-y += strtoull.o
+lib-$(CONFIG_X86) += x86-generic-hweightl.o
 lib-$(CONFIG_X86) += xxhash32.o
 lib-$(CONFIG_X86) += xxhash64.o
 
diff --git a/xen/lib/x86-generic-hweightl.c b/xen/lib/x86-generic-hweightl.c
new file mode 100644
index 000000000000..e0be25a01c1d
--- /dev/null
+++ b/xen/lib/x86-generic-hweightl.c
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <xen/bitops.h>
+#include <xen/init.h>
+#include <xen/self-tests.h>
+
+/*
+ * An implementation of generic_hweightl() used on hardware without the POPCNT
+ * instruction.
+ *
+ * This function is called from within an ALTERNATIVE in arch_hweightl().
+ * i.e. behind the back of the compiler.  Therefore all registers are callee
+ * preserved.
+ *
+ * The ASM is what GCC-12 emits for generic_hweightl() in a release build of
+ * Xen, with spilling of %rdi/%rdx to preserve the callers registers.
+ *
+ * Note: When we can use __attribute__((no_caller_saved_registers))
+ *       unconditionally (GCC 7, Clang 5), we can implement this in plain C.
+ */
+asm (
+    ".type arch_generic_hweightl, STT_FUNC\n\t"
+    ".globl arch_generic_hweightl\n\t"
+    ".hidden arch_generic_hweightl\n\t"
+    ".balign " STR(CONFIG_FUNCTION_ALIGNMENT) ", 0x90\n\t"
+    "arch_generic_hweightl:\n\t"
+
+    "push   %rdi\n\t"
+    "push   %rdx\n\t"
+
+    "movabs $0x5555555555555555, %rdx\n\t"
+    "mov    %rdi, %rax\n\t"
+    "shr    $1, %rax\n\t"
+    "and    %rdx, %rax\n\t"
+    "sub    %rax, %rdi\n\t"
+    "movabs $0x3333333333333333, %rax\n\t"
+    "mov    %rdi, %rdx\n\t"
+    "shr    $2, %rdi\n\t"
+    "and    %rax, %rdx\n\t"
+    "and    %rax, %rdi\n\t"
+    "add    %rdi, %rdx\n\t"
+    "mov    %rdx, %rax\n\t"
+    "shr    $4, %rax\n\t"
+    "add    %rdx, %rax\n\t"
+    "movabs $0x0f0f0f0f0f0f0f0f, %rdx\n\t"
+    "and    %rdx, %rax\n\t"
+    "movabs $0x0101010101010101, %rdx\n\t"
+    "imul   %rdx, %rax\n\t"
+    "shr    $" STR(BITS_PER_LONG) "- 8, %rax\n\t"
+
+    "pop    %rdx\n\t"
+    "pop    %rdi\n\t"
+
+    "ret\n\t"
+
+    ".size arch_generic_hweightl, . - arch_generic_hweightl\n\t"
+);
+
+static void __init __constructor test_arch_generic_hweightl(void)
+{
+    RUNTIME_CHECK(arch_generic_hweightl, 0, 0);
+    RUNTIME_CHECK(arch_generic_hweightl, 1, 1);
+    RUNTIME_CHECK(arch_generic_hweightl, 3, 2);
+    RUNTIME_CHECK(arch_generic_hweightl, 7, 3);
+    RUNTIME_CHECK(arch_generic_hweightl, 0xff, 8);
+
+    RUNTIME_CHECK(arch_generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+    RUNTIME_CHECK(arch_generic_hweightl, -1UL, BITS_PER_LONG);
+}
-- 
2.39.2




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.