Xen project Mailing List

Re: [PATCH v2 02/70] xen/sort: Switch to an extern inline implementation

From: Stefano Stabellini <sstabellini@xxxxxxxxxx>

Date: Tue, 15 Feb 2022 19:46:14 -0800 (PST)

Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxxx>, Jan Beulich <JBeulich@xxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>, Volodymyr Babchuk <Volodymyr_Babchuk@xxxxxxxx>, Bertrand Marquis <bertrand.marquis@xxxxxxx>

Delivery-date: Wed, 16 Feb 2022 03:46:33 +0000

List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On Mon, 14 Feb 2022, Julien Grall wrote: > On 14/02/2022 12:50, Andrew Cooper wrote: > > There are exactly 3 callers of sort() in the hypervisor. Callbacks in a > > tight > > loop like this are problematic for performance, especially with Spectre v2 > > protections, which is why extern inline is used commonly by libraries. > > > > Both ARM callers pass in NULL for the swap function, and while this might > > seem > > like an attractive option at first, it causes generic_swap() to be used, > > which > > forced a byte-wise copy. Provide real swap functions so the compiler can > > optimise properly, which is very important for ARM downstreams where > > milliseconds until the system is up matters. > > Did you actually benchmark it? Both those lists will have < 128 elements in > them. So I would be extremely surprised if you save more than a few hundreds > microseconds with this approach. > > So, my opinion on this approach hasn't changed. On v1, we discussed an > approach that would suit both Stefano and I. Jan seemed to confirm that would > also suit x86. This patch series has become 70 patches and for the sake of helping Andrew move forward in the quickest and most painless way possible, I append the following using generic_swap as static inline. Julien, Bertrand, is that acceptable to you? Andrew, I know this is not your favorite approach but you have quite a lot of changes to handle -- probably not worth focussing on one detail which is pretty minor? --- xen/sort: Switch to an extern inline implementation There are exactly 3 callers of sort() in the hypervisor. Callbacks in a tight loop like this are problematic for performance, especially with Spectre v2 protections, which is why extern inline is used commonly by libraries. Make generic_swap() a static inline and used it at the two ARM call sites. No functional change. Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> Signed-off-by: Stefano Stabellini <stefano.stabellini@xxxxxxxxxx> Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx> --- xen/arch/arm/bootfdt.c | 2 +- xen/arch/arm/io.c | 2 +- xen/include/xen/sort.h | 67 ++++++++++++++++++++++++++++++++++- xen/lib/sort.c | 80 ++---------------------------------------- 4 files changed, 70 insertions(+), 81 deletions(-) diff --git a/xen/arch/arm/bootfdt.c b/xen/arch/arm/bootfdt.c index afaa0e249b..0d62945d56 100644 --- a/xen/arch/arm/bootfdt.c +++ b/xen/arch/arm/bootfdt.c @@ -472,7 +472,7 @@ size_t __init boot_fdt_info(const void *fdt, paddr_t paddr) * the banks sorted in ascending order. So sort them through. */ sort(bootinfo.mem.bank, bootinfo.mem.nr_banks, sizeof(struct membank), - cmp_memory_node, NULL); + cmp_memory_node, generic_swap); early_print_info(); diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c index 729287e37c..1f35aaeea6 100644 --- a/xen/arch/arm/io.c +++ b/xen/arch/arm/io.c @@ -170,7 +170,7 @@ void register_mmio_handler(struct domain *d, /* Sort mmio handlers in ascending order based on base address */ sort(vmmio->handlers, vmmio->num_entries, sizeof(struct mmio_handler), - cmp_mmio_handler, NULL); + cmp_mmio_handler, generic_swap); write_unlock(&vmmio->lock); } diff --git a/xen/include/xen/sort.h b/xen/include/xen/sort.h index a403652948..f6065eda58 100644 --- a/xen/include/xen/sort.h +++ b/xen/include/xen/sort.h @@ -3,8 +3,73 @@ #include <xen/types.h> +extern gnu_inline +void generic_swap(void *a, void *b, size_t size) +{ + char t; + + do { + t = *(char *)a; + *(char *)a++ = *(char *)b; + *(char *)b++ = t; + } while ( --size > 0 ); +} + +/* + * sort - sort an array of elements + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp: pointer to comparison function + * @swap: pointer to swap function or NULL + * + * This function does a heapsort on the given array. You may provide a + * swap function optimized to your element type. + * + * Sorting time is O(n log n) both on average and worst-case. While + * qsort is about 20% faster on average, it suffers from exploitable + * O(n*n) worst-case behavior and extra memory requirements that make + * it less suitable for kernel use. + */ +#ifndef SORT_IMPLEMENTATION +extern gnu_inline +#endif void sort(void *base, size_t num, size_t size, int (*cmp)(const void *, const void *), - void (*swap)(void *, void *, size_t)); + void (*swap)(void *, void *, size_t)) +{ + /* pre-scale counters for performance */ + size_t i = (num / 2) * size, n = num * size, c, r; + + /* heapify */ + while ( i > 0 ) + { + for ( r = i -= size; r * 2 + size < n; r = c ) + { + c = r * 2 + size; + if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) ) + c += size; + if ( cmp(base + r, base + c) >= 0 ) + break; + swap(base + r, base + c, size); + } + } + + /* sort */ + for ( i = n; i > 0; ) + { + i -= size; + swap(base, base + i, size); + for ( r = 0; r * 2 + size < i; r = c ) + { + c = r * 2 + size; + if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) ) + c += size; + if ( cmp(base + r, base + c) >= 0 ) + break; + swap(base + r, base + c, size); + } + } +} #endif /* __XEN_SORT_H__ */ diff --git a/xen/lib/sort.c b/xen/lib/sort.c index 35ce0d7abd..b7e78cc0e8 100644 --- a/xen/lib/sort.c +++ b/xen/lib/sort.c @@ -4,81 +4,5 @@ * Jan 23 2005 Matt Mackall <mpm@xxxxxxxxxxx> */ -#include <xen/types.h> - -static void u32_swap(void *a, void *b, size_t size) -{ - uint32_t t = *(uint32_t *)a; - - *(uint32_t *)a = *(uint32_t *)b; - *(uint32_t *)b = t; -} - -static void generic_swap(void *a, void *b, size_t size) -{ - char t; - - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while ( --size > 0 ); -} - -/* - * sort - sort an array of elements - * @base: pointer to data to sort - * @num: number of elements - * @size: size of each element - * @cmp: pointer to comparison function - * @swap: pointer to swap function or NULL - * - * This function does a heapsort on the given array. You may provide a - * swap function optimized to your element type. - * - * Sorting time is O(n log n) both on average and worst-case. While - * qsort is about 20% faster on average, it suffers from exploitable - * O(n*n) worst-case behavior and extra memory requirements that make - * it less suitable for kernel use. - */ - -void sort(void *base, size_t num, size_t size, - int (*cmp)(const void *, const void *), - void (*swap)(void *, void *, size_t size)) -{ - /* pre-scale counters for performance */ - size_t i = (num / 2) * size, n = num * size, c, r; - - if ( !swap ) - swap = (size == 4 ? u32_swap : generic_swap); - - /* heapify */ - while ( i > 0 ) - { - for ( r = i -= size; r * 2 + size < n; r = c ) - { - c = r * 2 + size; - if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) ) - c += size; - if ( cmp(base + r, base + c) >= 0 ) - break; - swap(base + r, base + c, size); - } - } - - /* sort */ - for ( i = n; i > 0; ) - { - i -= size; - swap(base, base + i, size); - for ( r = 0; r * 2 + size < i; r = c ) - { - c = r * 2 + size; - if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) ) - c += size; - if ( cmp(base + r, base + c) >= 0 ) - break; - swap(base + r, base + c, size); - } - } -} +#define SORT_IMPLEMENTATION +#include <xen/sort.h> -- 2.25.1

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.