[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v5 05/25] Introduce clear_user and clear_guest



>>> On 19.01.12 at 17:46, "Jan Beulich" <JBeulich@xxxxxxxx> wrote:
> As I wanted to use the clear_guest stuff you add here for some
> cleanup/latent bug fix in x86's paging code, I pulled this patch into
> my local tree. Unfortunately the ia64 bits don't build, as you omitted
> the #define-s at the top of the file you took the code from.

Further, the file you put this in is one that shouldn't be modified. The
correct way is to simply pull in Linux'es clear_user.S (see below).

Jan

--- a/xen/arch/ia64/linux/Makefile
+++ b/xen/arch/ia64/linux/Makefile
@@ -4,6 +4,7 @@ subdir-y += sn
 
 obj-y += bitop.o
 obj-y += clear_page.o
+obj-y += clear_user.o
 obj-y += copy_page_mck.o
 obj-y += efi_stub.o
 obj-y += extable.o
--- a/xen/arch/ia64/linux/README.origin
+++ b/xen/arch/ia64/linux/README.origin
@@ -15,6 +15,7 @@ pcdp.h                        -> linux/drivers/firmware/pcdp.
 
 bitop.c                        -> linux/arch/ia64/lib/bitop.c
 clear_page.S           -> linux/arch/ia64/lib/clear_page.S
+clear_user.S           -> linux/arch/ia64/lib/clear_user.S
 copy_page_mck.S                -> linux/arch/ia64/lib/copy_page_mck.S
 flush.S                        -> linux/arch/ia64/lib/flush.S
 idiv32.S               -> linux/arch/ia64/lib/idiv32.S
--- /dev/null
+++ b/xen/arch/ia64/linux/clear_user.S
@@ -0,0 +1,209 @@
+/*
+ * This routine clears to zero a linear memory buffer in user space.
+ *
+ * Inputs:
+ *     in0:    address of buffer
+ *     in1:    length of buffer in bytes
+ * Outputs:
+ *     r8:     number of bytes that didn't get cleared due to a fault
+ *
+ * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
+ *     Stephane Eranian <eranian@xxxxxxxxxx>
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// arguments
+//
+#define buf            r32
+#define len            r33
+
+//
+// local registers
+//
+#define cnt            r16
+#define buf2           r17
+#define saved_lc       r18
+#define saved_pfs      r19
+#define tmp            r20
+#define len2           r21
+#define len3           r22
+
+//
+// Theory of operations:
+//     - we check whether or not the buffer is small, i.e., less than 17
+//       in which case we do the byte by byte loop.
+//
+//     - Otherwise we go progressively from 1 byte store to 8byte store in
+//       the head part, the body is a 16byte store loop and we finish we the
+//       tail for the last 15 bytes.
+//       The good point about this breakdown is that the long buffer handling
+//       contains only 2 branches.
+//
+//     The reason for not using shifting & masking for both the head and the
+//     tail is to stay semantically correct. This routine is not supposed
+//     to write bytes outside of the buffer. While most of the time this would
+//     be ok, we can't tolerate a mistake. A classical example is the case
+//     of multithreaded code were to the extra bytes touched is actually owned
+//     by another thread which runs concurrently to ours. Another, less likely,
+//     example is with device drivers where reading an I/O mapped location may
+//     have side effects (same thing for writing).
+//
+
+GLOBAL_ENTRY(__do_clear_user)
+       .prologue
+       .save ar.pfs, saved_pfs
+       alloc   saved_pfs=ar.pfs,2,0,0,0
+       cmp.eq p6,p0=r0,len             // check for zero length
+       .save ar.lc, saved_lc
+       mov saved_lc=ar.lc              // preserve ar.lc (slow)
+       .body
+       ;;                              // avoid WAW on CFM
+       adds tmp=-1,len                 // br.ctop is repeat/until
+       mov ret0=len                    // return value is length at this point
+(p6)   br.ret.spnt.many rp
+       ;;
+       cmp.lt p6,p0=16,len             // if len > 16 then long memset
+       mov ar.lc=tmp                   // initialize lc for small count
+(p6)   br.cond.dptk .long_do_clear
+       ;;                              // WAR on ar.lc
+       //
+       // worst case 16 iterations, avg 8 iterations
+       //
+       // We could have played with the predicates to use the extra
+       // M slot for 2 stores/iteration but the cost the initialization
+       // the various counters compared to how long the loop is supposed
+       // to last on average does not make this solution viable.
+       //
+1:
+       EX( .Lexit1, st1 [buf]=r0,1 )
+       adds len=-1,len                 // countdown length using len
+       br.cloop.dptk 1b
+       ;;                              // avoid RAW on ar.lc
+       //
+       // .Lexit4: comes from byte by byte loop
+       //          len contains bytes left
+.Lexit1:
+       mov ret0=len                    // faster than using ar.lc
+       mov ar.lc=saved_lc
+       br.ret.sptk.many rp             // end of short clear_user
+
+
+       //
+       // At this point we know we have more than 16 bytes to copy
+       // so we focus on alignment (no branches required)
+       //
+       // The use of len/len2 for countdown of the number of bytes left
+       // instead of ret0 is due to the fact that the exception code
+       // changes the values of r8.
+       //
+.long_do_clear:
+       tbit.nz p6,p0=buf,0             // odd alignment (for long_do_clear)
+       ;;
+       EX( .Lexit3, (p6) st1 [buf]=r0,1 )      // 1-byte aligned
+(p6)   adds len=-1,len;;               // sync because buf is modified
+       tbit.nz p6,p0=buf,1
+       ;;
+       EX( .Lexit3, (p6) st2 [buf]=r0,2 )      // 2-byte aligned
+(p6)   adds len=-2,len;;
+       tbit.nz p6,p0=buf,2
+       ;;
+       EX( .Lexit3, (p6) st4 [buf]=r0,4 )      // 4-byte aligned
+(p6)   adds len=-4,len;;
+       tbit.nz p6,p0=buf,3
+       ;;
+       EX( .Lexit3, (p6) st8 [buf]=r0,8 )      // 8-byte aligned
+(p6)   adds len=-8,len;;
+       shr.u cnt=len,4         // number of 128-bit (2x64bit) words
+       ;;
+       cmp.eq p6,p0=r0,cnt
+       adds tmp=-1,cnt
+(p6)   br.cond.dpnt .dotail            // we have less than 16 bytes left
+       ;;
+       adds buf2=8,buf                 // setup second base pointer
+       mov ar.lc=tmp
+       ;;
+
+       //
+       // 16bytes/iteration core loop
+       //
+       // The second store can never generate a fault because
+       // we come into the loop only when we are 16-byte aligned.
+       // This means that if we cross a page then it will always be
+       // in the first store and never in the second.
+       //
+       //
+       // We need to keep track of the remaining length. A possible 
(optimistic)
+       // way would be to use ar.lc and derive how many byte were left by
+       // doing : left= 16*ar.lc + 16.  this would avoid the addition at
+       // every iteration.
+       // However we need to keep the synchronization point. A template
+       // M;;MB does not exist and thus we can keep the addition at no
+       // extra cycle cost (use a nop slot anyway). It also simplifies the
+       // (unlikely)  error recovery code
+       //
+
+2:     EX(.Lexit3, st8 [buf]=r0,16 )
+       ;;                              // needed to get len correct when error
+       st8 [buf2]=r0,16
+       adds len=-16,len
+       br.cloop.dptk 2b
+       ;;
+       mov ar.lc=saved_lc
+       //
+       // tail correction based on len only
+       //
+       // We alternate the use of len3,len2 to allow parallelism and correct
+       // error handling. We also reuse p6/p7 to return correct value.
+       // The addition of len2/len3 does not cost anything more compared to
+       // the regular memset as we had empty slots.
+       //
+.dotail:
+       mov len2=len                    // for parallelization of error handling
+       mov len3=len
+       tbit.nz p6,p0=len,3
+       ;;
+       EX( .Lexit2, (p6) st8 [buf]=r0,8 )      // at least 8 bytes
+(p6)   adds len3=-8,len2
+       tbit.nz p7,p6=len,2
+       ;;
+       EX( .Lexit2, (p7) st4 [buf]=r0,4 )      // at least 4 bytes
+(p7)   adds len2=-4,len3
+       tbit.nz p6,p7=len,1
+       ;;
+       EX( .Lexit2, (p6) st2 [buf]=r0,2 )      // at least 2 bytes
+(p6)   adds len3=-2,len2
+       tbit.nz p7,p6=len,0
+       ;;
+       EX( .Lexit2, (p7) st1 [buf]=r0 )        // only 1 byte left
+       mov ret0=r0                             // success
+       br.ret.sptk.many rp                     // end of most likely path
+
+       //
+       // Outlined error handling code
+       //
+
+       //
+       // .Lexit3: comes from core loop, need restore pr/lc
+       //          len contains bytes left
+       //
+       //
+       // .Lexit2:
+       //      if p6 -> coming from st8 or st2 : len2 contains what's left
+       //      if p7 -> coming from st4 or st1 : len3 contains what's left
+       // We must restore lc/pr even though might not have been used.
+.Lexit2:
+       .pred.rel "mutex", p6, p7
+(p6)   mov len=len2
+(p7)   mov len=len3
+       ;;
+       //
+       // .Lexit4: comes from head, need not restore pr/lc
+       //          len contains bytes left
+       //
+.Lexit3:
+       mov ret0=len
+       mov ar.lc=saved_lc
+       br.ret.sptk.many rp
+END(__do_clear_user)



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.