
[xen staging] xen/riscv: introduce cmpxchg.h



commit 51dabd6312ce42e557bcba10710577306808a13e
Author:     Oleksii Kurochko <oleksii.kurochko@xxxxxxxxx>
AuthorDate: Fri May 17 15:54:54 2024 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Thu May 23 10:10:19 2024 +0200

    xen/riscv: introduce cmpxchg.h
    
    The header was taken from Linux kernel 6.4.0-rc1.
    
    Additionally, the following updates were made:
    * add emulation of {cmp}xchg for 1/2 byte types using 32-bit atomic
      access (see the sketch after this list)
    * replace tabs with spaces
    * replace __* variables with *__
    * introduce generic versions of xchg_* and cmpxchg_*
    * drop {cmp}xchg{release,relaxed,acquire} as Xen doesn't use them
    * drop barriers and use instruction suffixes ( .aq, .rl, .aqrl ) instead
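    
    A minimal illustration (not part of the patch; the helper name is made
    up) of how the sub-word emulation derives the aligned pointer, bit
    offset and mask for a 1-byte operand, assuming BITS_PER_BYTE == 8:
    ```
    #include <stdint.h>
    
    /* Hypothetical helper mirroring what emulate_{cmp,}xchg_1_2() compute. */
    static inline uint32_t *align_for_lr_sc(volatile uint8_t *p,
                                            unsigned int *shift,
                                            uint32_t *mask)
    {
        unsigned long alignment_mask = sizeof(uint32_t) - sizeof(*p); /* 3 */
    
        *shift = ((unsigned long)p & alignment_mask) * 8;
        *mask = 0xffU << *shift;
        return (uint32_t *)((unsigned long)p & ~alignment_mask);
    }
    
    /* For p pointing at byte 2 of a word: *shift == 16, *mask == 0x00ff0000 */
    ```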
    
    The implementation of the 4- and 8-byte cases was updated according to the spec:
    ```
                  ....
    Linux Construct         RVWMO AMO Mapping
        ...
    atomic <op>             amo<op>.{w|d}.aqrl
    Linux Construct         RVWMO LR/SC Mapping
        ...
    atomic <op>             loop: lr.{w|d}.aq; <op>; sc.{w|d}.aqrl; bnez loop
    
    Table A.5: Mappings from Linux memory primitives to RISC-V primitives
    
    ```
    
    The current implementation takes the same approach as 8e86f0b409a4
    ("arm64: atomics: fix use of acquire + release for full barrier
    semantics") [1].
    RISC-V can fold the acquire and release semantics into the LR/SC
    instructions themselves, which avoids a separate fence instruction and
    gives better performance. Here is the related description from the
    RISC-V ISA, section 10.2 Load-Reserved/Store-Conditional Instructions:
    
     - .aq:   The LR/SC sequence can be given acquire semantics by
              setting the aq bit on the LR instruction.
     - .rl:   The LR/SC sequence can be given release semantics by
              setting the rl bit on the SC instruction.
     - .aqrl: Setting the aq bit on the LR instruction, and setting
              both the aq and the rl bit on the SC instruction makes
              the LR/SC sequence sequentially consistent, meaning that
              it cannot be reordered with earlier or later memory
              operations from the same hart.
    
     Software should not set the rl bit on an LR instruction unless
     the aq bit is also set, nor should software set the aq bit on an
     SC instruction unless the rl bit is also set. LR.rl and SC.aq
     instructions are not guaranteed to provide any stronger ordering
     than those with both bits clear, but may result in lower
     performance.
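    
    As an illustration only (not taken from the patch or the spec), a fully
    ordered LR/SC sequence using the suffixes described above looks roughly
    like this:
    ```
    0: lr.w.aq    t0, (a0)      # load-reserved with acquire semantics
       # ... compute the new value in t2 ...
       sc.w.aqrl  t1, t2, (a0)  # store-conditional with aq+rl: sequentially
                                # consistent, per the rule quoted above
       bnez       t1, 0b        # retry if the reservation was lost
    ```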
    
    Also, the way of transforming ".rl + full barrier" to ".aqrl" was
    approved by the author of the RVWMO spec [2].
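    
    As a sketch (not code from either tree), that transformation replaces a
    release-only SC followed by a full fence:
    ```
    sc.w.rl   t1, t2, (a0)
    bnez      t1, 0b
    fence     rw, rw
    ```
    with a single fully ordered SC:
    ```
    sc.w.aqrl t1, t2, (a0)
    bnez      t1, 0b
    ```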
    
    [1] https://patchwork.kernel.org/project/linux-arm-kernel/patch/1391516953-14541-1-git-send-email-will.deacon@xxxxxxx/
    [2] https://lore.kernel.org/linux-riscv/41e01514-74ca-84f2-f5cc-2645c444fd8e@xxxxxxxxxx/
    
    Signed-off-by: Oleksii Kurochko <oleksii.kurochko@xxxxxxxxx>
    Acked-by: Jan Beulich <jbeulich@xxxxxxxx>
---
 xen/arch/riscv/include/asm/cmpxchg.h | 241 +++++++++++++++++++++++++++++++++++
 xen/arch/riscv/include/asm/config.h  |   2 +
 2 files changed, 243 insertions(+)

diff --git a/xen/arch/riscv/include/asm/cmpxchg.h b/xen/arch/riscv/include/asm/cmpxchg.h
new file mode 100644
index 0000000000..d5e678c036
--- /dev/null
+++ b/xen/arch/riscv/include/asm/cmpxchg.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2014 Regents of the University of California */
+
+#ifndef _ASM_RISCV_CMPXCHG_H
+#define _ASM_RISCV_CMPXCHG_H
+
+#include <xen/compiler.h>
+#include <xen/lib.h>
+
+#include <asm/fence.h>
+#include <asm/io.h>
+#include <asm/system.h>
+
+#define _amoswap_generic(ptr, new, ret, sfx) \
+    asm volatile ( \
+        " amoswap" sfx " %0, %2, %1" \
+        : "=r" (ret), "+A" (*(ptr)) \
+        : "r" (new) \
+        : "memory" );
+
+/*
+ * For LR and SC, the A extension requires that the address held in rs1 be
+ * naturally aligned to the size of the operand (i.e., eight-byte aligned
+ * for 64-bit words and four-byte aligned for 32-bit words).
+ * If the address is not naturally aligned, an address-misaligned exception
+ * or an access-fault exception will be generated.
+ *
+ * Therefore:
+ * - for a 1-byte xchg, access the containing word by clearing low two bits.
+ * - for a 2-byte xchg, access the containing word by clearing bit 1.
+ *
+ * If the resulting 4-byte access is still misaligned, it will fault just as a
+ * non-emulated 4-byte access would.
+ */
+#define emulate_xchg_1_2(ptr, new, lr_sfx, sc_sfx) \
+({ \
+    uint32_t *aligned_ptr; \
+    unsigned long alignment_mask = sizeof(*aligned_ptr) - sizeof(*(ptr)); \
+    unsigned int new_val_bit = \
+        ((unsigned long)(ptr) & alignment_mask) * BITS_PER_BYTE; \
+    unsigned long mask = \
+        GENMASK(((sizeof(*(ptr))) * BITS_PER_BYTE) - 1, 0) << new_val_bit; \
+    unsigned int new_ = (new) << new_val_bit; \
+    unsigned int old; \
+    unsigned int scratch; \
+    \
+    aligned_ptr = (uint32_t *)((unsigned long)(ptr) & ~alignment_mask); \
+    \
+    asm volatile ( \
+        "0: lr.w" lr_sfx " %[old], %[ptr_]\n" \
+        "   andn  %[scratch], %[old], %[mask]\n" \
+        "   or   %[scratch], %[scratch], %z[new_]\n" \
+        "   sc.w" sc_sfx " %[scratch], %[scratch], %[ptr_]\n" \
+        "   bnez %[scratch], 0b\n" \
+        : [old] "=&r" (old), [scratch] "=&r" (scratch), \
+          [ptr_] "+A" (*aligned_ptr) \
+        : [new_] "rJ" (new_), [mask] "r" (mask) \
+        : "memory" ); \
+    \
+    (__typeof__(*(ptr)))((old & mask) >> new_val_bit); \
+})
+
+/*
+ * This function doesn't exist, so you'll get a linker error
+ * if something tries to do an invalid xchg().
+ */
+extern unsigned long __bad_xchg(volatile void *ptr, unsigned int size);
+
+static always_inline unsigned long __xchg(volatile void *ptr,
+                                          unsigned long new,
+                                          unsigned int size)
+{
+    unsigned long ret;
+
+    switch ( size )
+    {
+    case 1:
+        ret = emulate_xchg_1_2((volatile uint8_t *)ptr, new, ".aq", ".aqrl");
+        break;
+    case 2:
+        ret = emulate_xchg_1_2((volatile uint16_t *)ptr, new, ".aq", ".aqrl");
+        break;
+    case 4:
+        _amoswap_generic((volatile uint32_t *)ptr, new, ret, ".w.aqrl");
+        break;
+#ifndef CONFIG_RISCV_32
+    case 8:
+        _amoswap_generic((volatile uint64_t *)ptr, new, ret, ".d.aqrl");
+        break;
+#endif
+    default:
+        return __bad_xchg(ptr, size);
+    }
+
+    return ret;
+}
+
+#define xchg(ptr, x) \
+({ \
+    __typeof__(*(ptr)) n_ = (x); \
+    (__typeof__(*(ptr))) \
+        __xchg(ptr, (unsigned long)n_, sizeof(*(ptr))); \
+})
+
+#define _generic_cmpxchg(ptr, old, new, lr_sfx, sc_sfx) \
+ ({ \
+    unsigned int rc; \
+    unsigned long ret; \
+    unsigned long mask = GENMASK(((sizeof(*(ptr))) * BITS_PER_BYTE) - 1, 0); \
+    asm volatile ( \
+        "0: lr" lr_sfx " %[ret], %[ptr_]\n" \
+        "   and  %[ret], %[ret], %[mask]\n" \
+        "   bne  %[ret], %z[old_], 1f\n" \
+        "   sc" sc_sfx " %[rc], %z[new_], %[ptr_]\n" \
+        "   bnez %[rc], 0b\n" \
+        "1:\n" \
+        : [ret] "=&r" (ret), [rc] "=&r" (rc), [ptr_] "+A" (*ptr) \
+        : [old_] "rJ" (old), [new_] "rJ" (new), [mask] "r" (mask)  \
+        : "memory" ); \
+    ret; \
+ })
+
+/*
+ * For LR and SC, the A extension requires that the address held in rs1 be
+ * naturally aligned to the size of the operand (i.e., eight-byte aligned
+ * for 64-bit words and four-byte aligned for 32-bit words).
+ * If the address is not naturally aligned, an address-misaligned exception
+ * or an access-fault exception will be generated.
+ *
+ * Therefore:
+ * - for a 1-byte cmpxchg, access the containing word by clearing low two bits.
+ * - for a 2-byte cmpxchg, access the containing word by clearing bit 1.
+ *
+ * If the resulting 4-byte access is still misaligned, it will fault just as a
+ * non-emulated 4-byte access would.
+ *
+ * old_val is cast to unsigned long for cmpxchgptr().
+ */
+#define emulate_cmpxchg_1_2(ptr, old, new, lr_sfx, sc_sfx) \
+({ \
+    uint32_t *aligned_ptr; \
+    unsigned long alignment_mask = sizeof(*aligned_ptr) - sizeof(*(ptr)); \
+    uint8_t new_val_bit = \
+        ((unsigned long)(ptr) & alignment_mask) * BITS_PER_BYTE; \
+    unsigned long mask = \
+        GENMASK(((sizeof(*(ptr))) * BITS_PER_BYTE) - 1, 0) << new_val_bit; \
+    unsigned int old_ = (old) << new_val_bit; \
+    unsigned int new_ = (new) << new_val_bit; \
+    unsigned int old_val; \
+    unsigned int scratch; \
+    \
+    aligned_ptr = (uint32_t *)((unsigned long)ptr & ~alignment_mask); \
+    \
+    asm volatile ( \
+        "0: lr.w" lr_sfx " %[scratch], %[ptr_]\n" \
+        "   and  %[old_val], %[scratch], %[mask]\n" \
+        "   bne  %[old_val], %z[old_], 1f\n" \
+        /* The following line is equivalent to: \
+         *     scratch = scratch & ~mask; \
+         * Since old_val == (scratch & mask) at this point, the same \
+         * result is obtained without referencing mask again via: \
+         *     scratch = old_val ^ scratch \
+         */ \
+        "   xor  %[scratch], %[old_val], %[scratch]\n" \
+        "   or   %[scratch], %[scratch], %z[new_]\n" \
+        "   sc.w" sc_sfx " %[scratch], %[scratch], %[ptr_]\n" \
+        "   bnez %[scratch], 0b\n" \
+        "1:\n" \
+        : [old_val] "=&r" (old_val), [scratch] "=&r" (scratch), \
+          [ptr_] "+A" (*aligned_ptr) \
+        : [old_] "rJ" (old_), [new_] "rJ" (new_), \
+          [mask] "r" (mask) \
+        : "memory" ); \
+    \
+    (__typeof__(*(ptr)))((unsigned long)old_val >> new_val_bit); \
+})
+
+/*
+ * This function doesn't exist, so you'll get a linker error
+ * if something tries to do an invalid cmpxchg().
+ */
+extern unsigned long __bad_cmpxchg(volatile void *ptr, unsigned int size);
+
+/*
+ * Atomic compare and exchange.  Compare OLD with MEM, if identical,
+ * store NEW in MEM.  Return the initial value in MEM.  Success is
+ * indicated by comparing RETURN with OLD.
+ */
+static always_inline unsigned long __cmpxchg(volatile void *ptr,
+                                             unsigned long old,
+                                             unsigned long new,
+                                             unsigned int size)
+{
+    unsigned long ret;
+
+    switch ( size )
+    {
+    case 1:
+        ret = emulate_cmpxchg_1_2((volatile uint8_t *)ptr, old, new,
+                                  ".aq", ".aqrl");
+        break;
+    case 2:
+        ret = emulate_cmpxchg_1_2((volatile uint16_t *)ptr, old, new,
+                                   ".aq", ".aqrl");
+        break;
+    case 4:
+        ret = _generic_cmpxchg((volatile uint32_t *)ptr, old, new,
+                               ".w.aq", ".w.aqrl");
+        break;
+#ifndef CONFIG_RISCV_32
+    case 8:
+        ret = _generic_cmpxchg((volatile uint64_t *)ptr, old, new,
+                               ".d.aq", ".d.aqrl");
+        break;
+#endif
+    default:
+        return __bad_cmpxchg(ptr, size);
+    }
+
+    return ret;
+}
+
+#define cmpxchg(ptr, o, n) \
+({ \
+    __typeof__(*(ptr)) o_ = (o); \
+    __typeof__(*(ptr)) n_ = (n); \
+    (__typeof__(*(ptr))) \
+    __cmpxchg(ptr, (unsigned long)o_, (unsigned long)n_, \
+              sizeof(*(ptr))); \
+})
+
+#endif /* _ASM_RISCV_CMPXCHG_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/riscv/include/asm/config.h b/xen/arch/riscv/include/asm/config.h
index c5f93e6a01..50583aafdc 100644
--- a/xen/arch/riscv/include/asm/config.h
+++ b/xen/arch/riscv/include/asm/config.h
@@ -119,6 +119,8 @@
 
 #define BITS_PER_LLONG 64
 
+#define BITS_PER_BYTE 8
+
 /* xen_ulong_t is always 64 bits */
 #define BITS_PER_XEN_ULONG 64
 
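
For reference, a hypothetical usage sketch of the macros this patch introduces
(illustration only; the variable and function names below are made up and are
not part of the patch or the Xen tree):
```
#include <asm/cmpxchg.h>

static unsigned int lock_word;

/* Returns 1 if the caller atomically changed lock_word from 0 to 1. */
static inline int try_acquire(void)
{
    /* cmpxchg() returns the value that was previously in memory. */
    return cmpxchg(&lock_word, 0, 1) == 0;
}

/* Atomically stores val and returns the previous contents of lock_word. */
static inline unsigned int swap_in(unsigned int val)
{
    return xchg(&lock_word, val);
}
```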
--
generated by git-patchbot for /home/xen/git/xen.git#staging