[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
Jan -- I assume the 12% speedup was measured on a benchmark... Have you measured how much faster the copy_page_sse2 routine (standalone) is than the memcpy? Is it a factor of 2? Thanks, Dan > -----Original Message----- > From: Jan Beulich [mailto:jbeulich@xxxxxxxxxx] > Sent: Wednesday, November 12, 2008 2:38 AM > To: xen-devel@xxxxxxxxxxxxxxxxxxx > Subject: [Xen-devel] [PATCH] x86: add SSE-based copy_page() > > > On top of the highmem assistance hypercalls added earlier, this provides > a performance improvement of another 12% (measured on Xeon E5345) for > the page copying case. > > Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx> > > Index: 2008-10-27/xen/arch/x86/Makefile > =================================================================== > --- 2008-10-27.orig/xen/arch/x86/Makefile 2008-11-11 > 16:19:45.000000000 +0100 > +++ 2008-10-27/xen/arch/x86/Makefile 2008-11-11 > 16:18:36.000000000 +0100 > @@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64 > obj-y += apic.o > obj-y += bitops.o > obj-y += clear_page.o > +obj-y += copy_page.o > obj-y += compat.o > obj-y += delay.o > obj-y += dmi_scan.o > Index: 2008-10-27/xen/arch/x86/copy_page.S > =================================================================== > --- /dev/null 1970-01-01 00:00:00.000000000 +0000 > +++ 2008-10-27/xen/arch/x86/copy_page.S 2008-06-03 > 14:24:57.000000000 +0200 > @@ -0,0 +1,66 @@ > +#include <xen/config.h> > +#include <asm/page.h> > + > +#ifdef __i386__ > +#define src_reg %esi > +#define dst_reg %edi > +#define WORD_SIZE 4 > +#define tmp1_reg %eax > +#define tmp2_reg %edx > +#define tmp3_reg %ebx > +#define tmp4_reg %ebp > +#else > +#define src_reg %rsi > +#define dst_reg %rdi > +#define WORD_SIZE 8 > +#define tmp1_reg %r8 > +#define tmp2_reg %r9 > +#define tmp3_reg %r10 > +#define tmp4_reg %r11 > +#endif > + > +ENTRY(copy_page_sse2) > +#ifdef __i386__ > + push %ebx > + push %ebp > + push %esi > + push %edi > + mov 6*4(%esp), src_reg > + mov 5*4(%esp), dst_reg > +#endif > + mov $PAGE_SIZE/(4*WORD_SIZE)-3, 
%ecx > + > + prefetchnta 2*4*WORD_SIZE(src_reg) > + mov (src_reg), tmp1_reg > + mov WORD_SIZE(src_reg), tmp2_reg > + mov 2*WORD_SIZE(src_reg), tmp3_reg > + mov 3*WORD_SIZE(src_reg), tmp4_reg > + > +0: prefetchnta 3*4*WORD_SIZE(src_reg) > +1: add $4*WORD_SIZE, src_reg > + movnti tmp1_reg, (dst_reg) > + mov (src_reg), tmp1_reg > + dec %ecx > + movnti tmp2_reg, WORD_SIZE(dst_reg) > + mov WORD_SIZE(src_reg), tmp2_reg > + movnti tmp3_reg, 2*WORD_SIZE(dst_reg) > + mov 2*WORD_SIZE(src_reg), tmp3_reg > + movnti tmp4_reg, 3*WORD_SIZE(dst_reg) > + lea 4*WORD_SIZE(dst_reg), dst_reg > + mov 3*WORD_SIZE(src_reg), tmp4_reg > + jg 0b > + jpe 1b > + > + movnti tmp1_reg, (dst_reg) > + movnti tmp2_reg, WORD_SIZE(dst_reg) > + movnti tmp3_reg, 2*WORD_SIZE(dst_reg) > + movnti tmp4_reg, 3*WORD_SIZE(dst_reg) > + > +#ifdef __i386__ > + pop %edi > + pop %esi > + pop %ebp > + pop %ebx > +#endif > + sfence > + ret > Index: 2008-10-27/xen/arch/x86/domain.c > =================================================================== > --- 2008-10-27.orig/xen/arch/x86/domain.c 2008-11-11 > 14:55:44.000000000 +0100 > +++ 2008-10-27/xen/arch/x86/domain.c 2008-11-11 > 16:24:48.000000000 +0100 > @@ -183,7 +183,8 @@ static int setup_compat_l4(struct vcpu * > /* This page needs to look like a pagetable so that it > can be shadowed */ > pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1; > > - l4tab = copy_page(page_to_virt(pg), idle_pg_table); > + l4tab = page_to_virt(pg); > + copy_page(l4tab, idle_pg_table); > l4tab[0] = l4e_empty(); > l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] = > l4e_from_page(pg, __PAGE_HYPERVISOR); > Index: 2008-10-27/xen/arch/x86/domain_build.c > =================================================================== > --- 2008-10-27.orig/xen/arch/x86/domain_build.c > 2008-11-11 16:19:45.000000000 +0100 > +++ 2008-10-27/xen/arch/x86/domain_build.c 2008-11-11 > 16:18:36.000000000 +0100 > @@ -467,8 +467,9 @@ int __init construct_dom0( > /* WARNING: The new domain must have its 
'processor' > field filled in! */ > l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc > += PAGE_SIZE; > l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc > += 4*PAGE_SIZE; > - memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE); > - for (i = 0; i < 4; i++) { > + for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) { > + copy_page(l2tab + i * L2_PAGETABLE_ENTRIES, > + idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES); > l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT); > l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] = > l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, > __PAGE_HYPERVISOR); > Index: 2008-10-27/xen/include/asm-x86/page.h > =================================================================== > --- 2008-10-27.orig/xen/include/asm-x86/page.h > 2008-11-11 16:19:45.000000000 +0100 > +++ 2008-10-27/xen/include/asm-x86/page.h 2008-11-11 > 16:18:36.000000000 +0100 > @@ -215,7 +215,10 @@ void clear_page_sse2(void *); > #define clear_page(_p) (cpu_has_xmm2 ? > \ > clear_page_sse2((void *)(_p)) : > \ > (void)memset((void *)(_p), 0, > PAGE_SIZE)) > -#define copy_page(_t,_f) memcpy((void *)(_t), (void > *)(_f), PAGE_SIZE) > +void copy_page_sse2(void *, const void *); > +#define copy_page(_t,_f) (cpu_has_xmm2 ? > \ > + copy_page_sse2(_t, _f) : > \ > + (void)memcpy(_t, _f, PAGE_SIZE)) > > #define mfn_valid(mfn) ((mfn) < max_page) > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@xxxxxxxxxxxxxxxxxxx > http://lists.xensource.com/xen-devel > _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |