[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [PATCH v2 02/70] xen/sort: Switch to an extern inline implementation
Hi Stefano, > On 16 Feb 2022, at 03:46, Stefano Stabellini <sstabellini@xxxxxxxxxx> wrote: > > On Mon, 14 Feb 2022, Julien Grall wrote: >> On 14/02/2022 12:50, Andrew Cooper wrote: >>> There are exactly 3 callers of sort() in the hypervisor. Callbacks in a >>> tight >>> loop like this are problematic for performance, especially with Spectre v2 >>> protections, which is why extern inline is used commonly by libraries. >>> >>> Both ARM callers pass in NULL for the swap function, and while this might >>> seem >>> like an attractive option at first, it causes generic_swap() to be used, >>> which >>> forced a byte-wise copy. Provide real swap functions so the compiler can >>> optimise properly, which is very important for ARM downstreams where >>> milliseconds until the system is up matters. >> >> Did you actually benchmark it? Both those lists will have < 128 elements in >> them. So I would be extremely surprised if you save more than a few hundreds >> microseconds with this approach. >> >> So, my opinion on this approach hasn't changed. On v1, we discussed an >> approach that would suit both Stefano and I. Jan seemed to confirm that would >> also suit x86. > > > This patch series has become 70 patches and for the sake of helping > Andrew move forward in the quickest and most painless way possible, I > append the following using generic_swap as static inline. > > Julien, Bertrand, is that acceptable to you? Any reason why we cannot in this case keep the NULL parameter in the existing code and do the if swap == NULL handling in the sort code ? Other then that this is acceptable for me but I will let Julien say if he is ok or not as I had no objections before. Regards Bertrand > > Andrew, I know this is not your favorite approach but you have quite a > lot of changes to handle -- probably not worth focussing on one detail > which is pretty minor? > > > --- > xen/sort: Switch to an extern inline implementation > > There are exactly 3 callers of sort() in the hypervisor. Callbacks in a tight > loop like this are problematic for performance, especially with Spectre v2 > protections, which is why extern inline is used commonly by libraries. > > Make generic_swap() a static inline and used it at the two ARM call > sites. > > No functional change. > > Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> > Signed-off-by: Stefano Stabellini <stefano.stabellini@xxxxxxxxxx> > Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx> > --- > xen/arch/arm/bootfdt.c | 2 +- > xen/arch/arm/io.c | 2 +- > xen/include/xen/sort.h | 67 ++++++++++++++++++++++++++++++++++- > xen/lib/sort.c | 80 ++---------------------------------------- > 4 files changed, 70 insertions(+), 81 deletions(-) > > diff --git a/xen/arch/arm/bootfdt.c b/xen/arch/arm/bootfdt.c > index afaa0e249b..0d62945d56 100644 > --- a/xen/arch/arm/bootfdt.c > +++ b/xen/arch/arm/bootfdt.c > @@ -472,7 +472,7 @@ size_t __init boot_fdt_info(const void *fdt, paddr_t > paddr) > * the banks sorted in ascending order. So sort them through. > */ > sort(bootinfo.mem.bank, bootinfo.mem.nr_banks, sizeof(struct membank), > - cmp_memory_node, NULL); > + cmp_memory_node, generic_swap); > > early_print_info(); > > diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c > index 729287e37c..1f35aaeea6 100644 > --- a/xen/arch/arm/io.c > +++ b/xen/arch/arm/io.c > @@ -170,7 +170,7 @@ void register_mmio_handler(struct domain *d, > > /* Sort mmio handlers in ascending order based on base address */ > sort(vmmio->handlers, vmmio->num_entries, sizeof(struct mmio_handler), > - cmp_mmio_handler, NULL); > + cmp_mmio_handler, generic_swap); > > write_unlock(&vmmio->lock); > } > diff --git a/xen/include/xen/sort.h b/xen/include/xen/sort.h > index a403652948..f6065eda58 100644 > --- a/xen/include/xen/sort.h > +++ b/xen/include/xen/sort.h > @@ -3,8 +3,73 @@ > > #include <xen/types.h> > > +extern gnu_inline > +void generic_swap(void *a, void *b, size_t size) > +{ > + char t; > + > + do { > + t = *(char *)a; > + *(char *)a++ = *(char *)b; > + *(char *)b++ = t; > + } while ( --size > 0 ); > +} > + > +/* > + * sort - sort an array of elements > + * @base: pointer to data to sort > + * @num: number of elements > + * @size: size of each element > + * @cmp: pointer to comparison function > + * @swap: pointer to swap function or NULL > + * > + * This function does a heapsort on the given array. You may provide a > + * swap function optimized to your element type. > + * > + * Sorting time is O(n log n) both on average and worst-case. While > + * qsort is about 20% faster on average, it suffers from exploitable > + * O(n*n) worst-case behavior and extra memory requirements that make > + * it less suitable for kernel use. > + */ > +#ifndef SORT_IMPLEMENTATION > +extern gnu_inline > +#endif > void sort(void *base, size_t num, size_t size, > int (*cmp)(const void *, const void *), > - void (*swap)(void *, void *, size_t)); > + void (*swap)(void *, void *, size_t)) > +{ > + /* pre-scale counters for performance */ > + size_t i = (num / 2) * size, n = num * size, c, r; > + > + /* heapify */ > + while ( i > 0 ) > + { > + for ( r = i -= size; r * 2 + size < n; r = c ) > + { > + c = r * 2 + size; > + if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) ) > + c += size; > + if ( cmp(base + r, base + c) >= 0 ) > + break; > + swap(base + r, base + c, size); > + } > + } > + > + /* sort */ > + for ( i = n; i > 0; ) > + { > + i -= size; > + swap(base, base + i, size); > + for ( r = 0; r * 2 + size < i; r = c ) > + { > + c = r * 2 + size; > + if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) ) > + c += size; > + if ( cmp(base + r, base + c) >= 0 ) > + break; > + swap(base + r, base + c, size); > + } > + } > +} > > #endif /* __XEN_SORT_H__ */ > diff --git a/xen/lib/sort.c b/xen/lib/sort.c > index 35ce0d7abd..b7e78cc0e8 100644 > --- a/xen/lib/sort.c > +++ b/xen/lib/sort.c > @@ -4,81 +4,5 @@ > * Jan 23 2005 Matt Mackall <mpm@xxxxxxxxxxx> > */ > > -#include <xen/types.h> > - > -static void u32_swap(void *a, void *b, size_t size) > -{ > - uint32_t t = *(uint32_t *)a; > - > - *(uint32_t *)a = *(uint32_t *)b; > - *(uint32_t *)b = t; > -} > - > -static void generic_swap(void *a, void *b, size_t size) > -{ > - char t; > - > - do { > - t = *(char *)a; > - *(char *)a++ = *(char *)b; > - *(char *)b++ = t; > - } while ( --size > 0 ); > -} > - > -/* > - * sort - sort an array of elements > - * @base: pointer to data to sort > - * @num: number of elements > - * @size: size of each element > - * @cmp: pointer to comparison function > - * @swap: pointer to swap function or NULL > - * > - * This function does a heapsort on the given array. You may provide a > - * swap function optimized to your element type. > - * > - * Sorting time is O(n log n) both on average and worst-case. While > - * qsort is about 20% faster on average, it suffers from exploitable > - * O(n*n) worst-case behavior and extra memory requirements that make > - * it less suitable for kernel use. > - */ > - > -void sort(void *base, size_t num, size_t size, > - int (*cmp)(const void *, const void *), > - void (*swap)(void *, void *, size_t size)) > -{ > - /* pre-scale counters for performance */ > - size_t i = (num / 2) * size, n = num * size, c, r; > - > - if ( !swap ) > - swap = (size == 4 ? u32_swap : generic_swap); > - > - /* heapify */ > - while ( i > 0 ) > - { > - for ( r = i -= size; r * 2 + size < n; r = c ) > - { > - c = r * 2 + size; > - if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) ) > - c += size; > - if ( cmp(base + r, base + c) >= 0 ) > - break; > - swap(base + r, base + c, size); > - } > - } > - > - /* sort */ > - for ( i = n; i > 0; ) > - { > - i -= size; > - swap(base, base + i, size); > - for ( r = 0; r * 2 + size < i; r = c ) > - { > - c = r * 2 + size; > - if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) ) > - c += size; > - if ( cmp(base + r, base + c) >= 0 ) > - break; > - swap(base + r, base + c, size); > - } > - } > -} > +#define SORT_IMPLEMENTATION > +#include <xen/sort.h> > -- > 2.25.1 >
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |