Xen project Mailing List

Re: [Xen-devel] [PATCH v5 05/25] Introduce clear_user and clear_guest

From: Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx>

Date: Fri, 20 Jan 2012 11:28:29 +0000

Cc: "Tim Deegan \(3P\)" <Tim.Deegan@xxxxxxxxxx>, "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>, Ian Campbell <Ian.Campbell@xxxxxxxxxx>, Stefano Stabellini <Stefano.Stabellini@xxxxxxxxxxxxx>

Delivery-date: Fri, 20 Jan 2012 11:28:11 +0000

List-id: Xen developer discussion <xen-devel.lists.xensource.com>

On Thu, 19 Jan 2012, Jan Beulich wrote: > >>> On 19.01.12 at 17:46, "Jan Beulich" <JBeulich@xxxxxxxx> wrote: > > As I wanted to use the clear_guest stuff you add here for some > > cleanup/latent bug fix in x86's paging code, I pulled this patch into > > my local tree. Unfortunately the ia64 bits don't build, as you omitted > > the #define-s at the top of the file you took the code from. > > Further, the file you put this in is one that shouldn't be modified. The > correct way is to simply pull in Linux'es clear_user.S (see below). > Yes, I think that you are correct. > --- a/xen/arch/ia64/linux/Makefile > +++ b/xen/arch/ia64/linux/Makefile > @@ -4,6 +4,7 @@ subdir-y += sn > > obj-y += bitop.o > obj-y += clear_page.o > +obj-y += clear_user.o > obj-y += copy_page_mck.o > obj-y += efi_stub.o > obj-y += extable.o > --- a/xen/arch/ia64/linux/README.origin > +++ b/xen/arch/ia64/linux/README.origin > @@ -15,6 +15,7 @@ pcdp.h -> linux/drivers/firmware/pcdp. > > bitop.c -> linux/arch/ia64/lib/bitop.c > clear_page.S -> linux/arch/ia64/lib/clear_page.S > +clear_user.S -> linux/arch/ia64/lib/clear_user.S > copy_page_mck.S -> linux/arch/ia64/lib/copy_page_mck.S > flush.S -> linux/arch/ia64/lib/flush.S > idiv32.S -> linux/arch/ia64/lib/idiv32.S > --- /dev/null > +++ b/xen/arch/ia64/linux/clear_user.S > @@ -0,0 +1,209 @@ > +/* > + * This routine clears to zero a linear memory buffer in user space. > + * > + * Inputs: > + * in0: address of buffer > + * in1: length of buffer in bytes > + * Outputs: > + * r8: number of bytes that didn't get cleared due to a fault > + * > + * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co > + * Stephane Eranian <eranian@xxxxxxxxxx> > + */ > + > +#include <asm/asmmacro.h> > + > +// > +// arguments > +// > +#define buf r32 > +#define len r33 > + > +// > +// local registers > +// > +#define cnt r16 > +#define buf2 r17 > +#define saved_lc r18 > +#define saved_pfs r19 > +#define tmp r20 > +#define len2 r21 > +#define len3 r22 > + > +// > +// Theory of operations: > +// - we check whether or not the buffer is small, i.e., less than 17 > +// in which case we do the byte by byte loop. > +// > +// - Otherwise we go progressively from 1 byte store to 8byte store in > +// the head part, the body is a 16byte store loop and we finish we the > +// tail for the last 15 bytes. > +// The good point about this breakdown is that the long buffer handling > +// contains only 2 branches. > +// > +// The reason for not using shifting & masking for both the head and the > +// tail is to stay semantically correct. This routine is not supposed > +// to write bytes outside of the buffer. While most of the time this would > +// be ok, we can't tolerate a mistake. A classical example is the case > +// of multithreaded code were to the extra bytes touched is actually owned > +// by another thread which runs concurrently to ours. Another, less likely, > +// example is with device drivers where reading an I/O mapped location may > +// have side effects (same thing for writing). > +// > + > +GLOBAL_ENTRY(__do_clear_user) > + .prologue > + .save ar.pfs, saved_pfs > + alloc saved_pfs=ar.pfs,2,0,0,0 > + cmp.eq p6,p0=r0,len // check for zero length > + .save ar.lc, saved_lc > + mov saved_lc=ar.lc // preserve ar.lc (slow) > + .body > + ;; // avoid WAW on CFM > + adds tmp=-1,len // br.ctop is repeat/until > + mov ret0=len // return value is length at this point > +(p6) br.ret.spnt.many rp > + ;; > + cmp.lt p6,p0=16,len // if len > 16 then long memset > + mov ar.lc=tmp // initialize lc for small count > +(p6) br.cond.dptk .long_do_clear > + ;; // WAR on ar.lc > + // > + // worst case 16 iterations, avg 8 iterations > + // > + // We could have played with the predicates to use the extra > + // M slot for 2 stores/iteration but the cost the initialization > + // the various counters compared to how long the loop is supposed > + // to last on average does not make this solution viable. > + // > +1: > + EX( .Lexit1, st1 [buf]=r0,1 ) > + adds len=-1,len // countdown length using len > + br.cloop.dptk 1b > + ;; // avoid RAW on ar.lc > + // > + // .Lexit4: comes from byte by byte loop > + // len contains bytes left > +.Lexit1: > + mov ret0=len // faster than using ar.lc > + mov ar.lc=saved_lc > + br.ret.sptk.many rp // end of short clear_user > + > + > + // > + // At this point we know we have more than 16 bytes to copy > + // so we focus on alignment (no branches required) > + // > + // The use of len/len2 for countdown of the number of bytes left > + // instead of ret0 is due to the fact that the exception code > + // changes the values of r8. > + // > +.long_do_clear: > + tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) > + ;; > + EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned > +(p6) adds len=-1,len;; // sync because buf is modified > + tbit.nz p6,p0=buf,1 > + ;; > + EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned > +(p6) adds len=-2,len;; > + tbit.nz p6,p0=buf,2 > + ;; > + EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned > +(p6) adds len=-4,len;; > + tbit.nz p6,p0=buf,3 > + ;; > + EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned > +(p6) adds len=-8,len;; > + shr.u cnt=len,4 // number of 128-bit (2x64bit) words > + ;; > + cmp.eq p6,p0=r0,cnt > + adds tmp=-1,cnt > +(p6) br.cond.dpnt .dotail // we have less than 16 bytes left > + ;; > + adds buf2=8,buf // setup second base pointer > + mov ar.lc=tmp > + ;; > + > + // > + // 16bytes/iteration core loop > + // > + // The second store can never generate a fault because > + // we come into the loop only when we are 16-byte aligned. > + // This means that if we cross a page then it will always be > + // in the first store and never in the second. > + // > + // > + // We need to keep track of the remaining length. A possible > (optimistic) > + // way would be to use ar.lc and derive how many byte were left by > + // doing : left= 16*ar.lc + 16. this would avoid the addition at > + // every iteration. > + // However we need to keep the synchronization point. A template > + // M;;MB does not exist and thus we can keep the addition at no > + // extra cycle cost (use a nop slot anyway). It also simplifies the > + // (unlikely) error recovery code > + // > + > +2: EX(.Lexit3, st8 [buf]=r0,16 ) > + ;; // needed to get len correct when error > + st8 [buf2]=r0,16 > + adds len=-16,len > + br.cloop.dptk 2b > + ;; > + mov ar.lc=saved_lc > + // > + // tail correction based on len only > + // > + // We alternate the use of len3,len2 to allow parallelism and correct > + // error handling. We also reuse p6/p7 to return correct value. > + // The addition of len2/len3 does not cost anything more compared to > + // the regular memset as we had empty slots. > + // > +.dotail: > + mov len2=len // for parallelization of error handling > + mov len3=len > + tbit.nz p6,p0=len,3 > + ;; > + EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes > +(p6) adds len3=-8,len2 > + tbit.nz p7,p6=len,2 > + ;; > + EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes > +(p7) adds len2=-4,len3 > + tbit.nz p6,p7=len,1 > + ;; > + EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes > +(p6) adds len3=-2,len2 > + tbit.nz p7,p6=len,0 > + ;; > + EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left > + mov ret0=r0 // success > + br.ret.sptk.many rp // end of most likely path > + > + // > + // Outlined error handling code > + // > + > + // > + // .Lexit3: comes from core loop, need restore pr/lc > + // len contains bytes left > + // > + // > + // .Lexit2: > + // if p6 -> coming from st8 or st2 : len2 contains what's left > + // if p7 -> coming from st4 or st1 : len3 contains what's left > + // We must restore lc/pr even though might not have been used. > +.Lexit2: > + .pred.rel "mutex", p6, p7 > +(p6) mov len=len2 > +(p7) mov len=len3 > + ;; > + // > + // .Lexit4: comes from head, need not restore pr/lc > + // len contains bytes left > + // > +.Lexit3: > + mov ret0=len > + mov ar.lc=saved_lc > + br.ret.sptk.many rp > +END(__do_clear_user) > > _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.