[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH v2 05/25] Introduce clear_user
From: Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx> Introduce clear_user for x86 and ia64, shamelessly taken from Linux. The x86 version is the 32 bit clear_user implementation. Changes in v2: - change d0 to be a long; - cast addr to long. Signed-off-by: Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx> --- xen/arch/ia64/linux/memcpy_mck.S | 177 ++++++++++++++++++++++++++++++++++++++ xen/arch/x86/usercopy.c | 36 ++++++++ xen/include/asm-ia64/uaccess.h | 12 +++ xen/include/asm-x86/uaccess.h | 1 + 4 files changed, 226 insertions(+), 0 deletions(-) diff --git a/xen/arch/ia64/linux/memcpy_mck.S b/xen/arch/ia64/linux/memcpy_mck.S index 6f308e6..8b07006 100644 --- a/xen/arch/ia64/linux/memcpy_mck.S +++ b/xen/arch/ia64/linux/memcpy_mck.S @@ -659,3 +659,180 @@ EK(.ex_handler, (p17) st8 [dst1]=r39,8); \ /* end of McKinley specific optimization */ END(__copy_user) + +/* + * Theory of operations: + * - we check whether or not the buffer is small, i.e., less than 17 + * in which case we do the byte by byte loop. + * + * - Otherwise we go progressively from 1 byte store to 8byte store in + * the head part, the body is a 16byte store loop and we finish we the + * tail for the last 15 bytes. + * The good point about this breakdown is that the long buffer handling + * contains only 2 branches. + * + * The reason for not using shifting & masking for both the head and the + * tail is to stay semantically correct. This routine is not supposed + * to write bytes outside of the buffer. While most of the time this would + * be ok, we can't tolerate a mistake. A classical example is the case + * of multithreaded code were to the extra bytes touched is actually owned + * by another thread which runs concurrently to ours. Another, less likely, + * example is with device drivers where reading an I/O mapped location may + * have side effects (same thing for writing). + */ +GLOBAL_ENTRY(__do_clear_user) + .prologue + .save ar.pfs, saved_pfs + alloc saved_pfs=ar.pfs,2,0,0,0 + cmp.eq p6,p0=r0,len // check for zero length + .save ar.lc, saved_lc + mov saved_lc=ar.lc // preserve ar.lc (slow) + .body + ;; // avoid WAW on CFM + adds tmp=-1,len // br.ctop is repeat/until + mov ret0=len // return value is length at this point +(p6) br.ret.spnt.many rp + ;; + cmp.lt p6,p0=16,len // if len > 16 then long memset + mov ar.lc=tmp // initialize lc for small count +(p6) br.cond.dptk .long_do_clear + ;; // WAR on ar.lc + // + // worst case 16 iterations, avg 8 iterations + // + // We could have played with the predicates to use the extra + // M slot for 2 stores/iteration but the cost the initialization + // the various counters compared to how long the loop is supposed + // to last on average does not make this solution viable. + // +1: + EX( .Lexit1, st1 [buf]=r0,1 ) + adds len=-1,len // countdown length using len + br.cloop.dptk 1b + ;; // avoid RAW on ar.lc + // + // .Lexit4: comes from byte by byte loop + // len contains bytes left +.Lexit1: + mov ret0=len // faster than using ar.lc + mov ar.lc=saved_lc + br.ret.sptk.many rp // end of short clear_user + + + // + // At this point we know we have more than 16 bytes to copy + // so we focus on alignment (no branches required) + // + // The use of len/len2 for countdown of the number of bytes left + // instead of ret0 is due to the fact that the exception code + // changes the values of r8. + // +.long_do_clear: + tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) + ;; + EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned +(p6) adds len=-1,len;; // sync because buf is modified + tbit.nz p6,p0=buf,1 + ;; + EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned +(p6) adds len=-2,len;; + tbit.nz p6,p0=buf,2 + ;; + EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned +(p6) adds len=-4,len;; + tbit.nz p6,p0=buf,3 + ;; + EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned +(p6) adds len=-8,len;; + shr.u cnt=len,4 // number of 128-bit (2x64bit) words + ;; + cmp.eq p6,p0=r0,cnt + adds tmp=-1,cnt +(p6) br.cond.dpnt .dotail // we have less than 16 bytes left + ;; + adds buf2=8,buf // setup second base pointer + mov ar.lc=tmp + ;; + + // + // 16bytes/iteration core loop + // + // The second store can never generate a fault because + // we come into the loop only when we are 16-byte aligned. + // This means that if we cross a page then it will always be + // in the first store and never in the second. + // + // + // We need to keep track of the remaining length. A possible (optimistic) + // way would be to use ar.lc and derive how many byte were left by + // doing : left= 16*ar.lc + 16. this would avoid the addition at + // every iteration. + // However we need to keep the synchronization point. A template + // M;;MB does not exist and thus we can keep the addition at no + // extra cycle cost (use a nop slot anyway). It also simplifies the + // (unlikely) error recovery code + // + +2: EX(.Lexit3, st8 [buf]=r0,16 ) + ;; // needed to get len correct when error + st8 [buf2]=r0,16 + adds len=-16,len + br.cloop.dptk 2b + ;; + mov ar.lc=saved_lc + // + // tail correction based on len only + // + // We alternate the use of len3,len2 to allow parallelism and correct + // error handling. We also reuse p6/p7 to return correct value. + // The addition of len2/len3 does not cost anything more compared to + // the regular memset as we had empty slots. + // +.dotail: + mov len2=len // for parallelization of error handling + mov len3=len + tbit.nz p6,p0=len,3 + ;; + EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes +(p6) adds len3=-8,len2 + tbit.nz p7,p6=len,2 + ;; + EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes +(p7) adds len2=-4,len3 + tbit.nz p6,p7=len,1 + ;; + EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes +(p6) adds len3=-2,len2 + tbit.nz p7,p6=len,0 + ;; + EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left + mov ret0=r0 // success + br.ret.sptk.many rp // end of most likely path + + // + // Outlined error handling code + // + + // + // .Lexit3: comes from core loop, need restore pr/lc + // len contains bytes left + // + // + // .Lexit2: + // if p6 -> coming from st8 or st2 : len2 contains what's left + // if p7 -> coming from st4 or st1 : len3 contains what's left + // We must restore lc/pr even though might not have been used. +.Lexit2: + .pred.rel "mutex", p6, p7 +(p6) mov len=len2 +(p7) mov len=len3 + ;; + // + // .Lexit4: comes from head, need not restore pr/lc + // len contains bytes left + // +.Lexit3: + mov ret0=len + mov ar.lc=saved_lc + br.ret.sptk.many rp +END(__do_clear_user) diff --git a/xen/arch/x86/usercopy.c b/xen/arch/x86/usercopy.c index d88e635..47dadae 100644 --- a/xen/arch/x86/usercopy.c +++ b/xen/arch/x86/usercopy.c @@ -110,6 +110,42 @@ copy_to_user(void __user *to, const void *from, unsigned n) return n; } +#define __do_clear_user(addr,size) \ +do { \ + long __d0; \ + __asm__ __volatile__( \ + "0: rep; stosl\n" \ + " movl %2,%0\n" \ + "1: rep; stosb\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: lea 0(%2,%0,4),%0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(0b,3b) \ + _ASM_EXTABLE(1b,2b) \ + : "=&c"(size), "=&D" (__d0) \ + : "r"(size & 3), "0"(size / 4), "1"((long)addr), "a"(0)); \ +} while (0) + +/** + * clear_user: - Zero a block of memory in user space. + * @to: Destination address, in user space. + * @n: Number of bytes to zero. + * + * Zero a block of memory in user space. + * + * Returns number of bytes that could not be cleared. + * On success, this will be zero. + */ +unsigned long +clear_user(void __user *to, unsigned n) +{ + if ( access_ok(to, n) ) + __do_clear_user(to, n); + return n; +} + /** * copy_from_user: - Copy a block of data from user space. * @to: Destination address, in kernel space. diff --git a/xen/include/asm-ia64/uaccess.h b/xen/include/asm-ia64/uaccess.h index 32ef415..2ececb1 100644 --- a/xen/include/asm-ia64/uaccess.h +++ b/xen/include/asm-ia64/uaccess.h @@ -236,6 +236,18 @@ __copy_from_user (void *to, const void __user *from, unsigned long count) __cu_len; \ }) +extern unsigned long __do_clear_user (void __user * to, unsigned long count); + +#define clear_user(to, n) \ +({ \ + void __user *__cu_to = (to); \ + long __cu_len = (n); \ + \ + if (__access_ok(__cu_to)) \ + __cu_len = __do_clear_user(__cu_to, __cu_len); \ + __cu_len; \ +}) + #define copy_from_user(to, from, n) \ ({ \ void *__cu_to = (to); \ diff --git a/xen/include/asm-x86/uaccess.h b/xen/include/asm-x86/uaccess.h index e3e541b..d6f4458 100644 --- a/xen/include/asm-x86/uaccess.h +++ b/xen/include/asm-x86/uaccess.h @@ -16,6 +16,7 @@ #endif unsigned long copy_to_user(void *to, const void *from, unsigned len); +unsigned long clear_user(void *to, unsigned len); unsigned long copy_from_user(void *to, const void *from, unsigned len); /* Handles exceptions in both to and from, but doesn't do access_ok */ unsigned long __copy_to_user_ll(void *to, const void *from, unsigned n); -- 1.7.2.5 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |