
Re: [Xen-devel] [PATCH RFC v2 1/4] x86/mm: Shadow and p2m changes for PV mem_access



>>No, at least not about unspecified hypothetical ones. But again - a
>>vague statement like you gave, without any kind of quantification of
>>the imposed overhead, isn't going to be good enough a judgment.
>>After all pausing a domain can be quite problematic for its performance
>>if that happens reasonably frequently. Otoh I admit that the user of
>>your new mechanism has a certain level of control over the impact via
>>the number of pages (s)he wants to write-protect. So yes, perhaps it
>>isn't going to be too bad as long as the hackery you need to do isn't.
>
>I just wanted to give an update as to where I stand so as not to leave this
>thread hanging. As I was working through pausing and unpausing the domain
>during Xen writes to guest memory, I found a spot where the write happens
>outside of __copy_to_user_ll() and __put_user_size(). It occurs in
>create_bounce_frame(), where Xen writes to the guest stack to set up an
>exception frame. At that moment, if the guest stack is marked read-only, we
>end up in the same situation as with the copy-to-guest functions, but with no
>obvious place to revert the page table entry back to read-only. I am
>currently looking for a spot where I can do this.

I have a solution for the create_bounce_frame() issue I described above. Please
find below a POC patch that includes pausing and unpausing the domain during
the Xen writes to guest memory. I have kept it on top of the patch that was
using CR0.WP, to highlight the difference. Please take a look and let me know
if this solution is acceptable.
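
In outline, this is what the patch below does (the condition and all of the
field and function names here are the ones used in the patch; this is only a
condensed summary, not additional code):

    /* sh_page_fault(): the faulting write comes from Xen, not the guest. */
    if ( violation && access_w &&
         regs->eip >= XEN_VIRT_START && regs->eip <= XEN_VIRT_END )
    {
        violation = 0;
        if ( guest_l1e_get_flags(gw.l1e) & _PAGE_RW )
        {
            domain_pause_nosync(d);        /* keep other VCPUs off the page */
            v->arch.pv_vcpu.mfn_access_reset = mfn_x(gmfn);
            v->arch.pv_vcpu.mfn_access_reset_req = 1;
            shadow_set_access(mfn_to_page(gmfn), p2m_access_rw);
        }
    }

    /* ...and once the write has gone through, at the tail of
     * __copy_to_user_ll(), __put_user_size() and create_bounce_frame(): */
    if ( unlikely(current->arch.pv_vcpu.mfn_access_reset_req) )
        p2m_mem_access_reset_mfn_entry();  /* default access, flush, unpause */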

PS: I do realize that whatever I do to create_bounce_frame() will have to be
reflected in the compat version. If this is the correct approach, I will do the
same there too.

Thanks,
Aravindh

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 49d8545..758f7f8 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -734,6 +734,9 @@ int arch_set_info_guest(
         if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) ||
              (c(ldt_ents) > 8192) )
             return -EINVAL;
+
+        v->arch.pv_vcpu.mfn_access_reset_req = 0;
+        v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
     }
     else if ( is_pvh_vcpu(v) )
     {
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index d4473c1..6e6b7f8 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -1168,9 +1168,13 @@ int __init construct_dom0(
                COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
     }
 
-    /* Pages that are part of page tables must be read only. */
     if  ( is_pv_domain(d) )
+    {
+        v->arch.pv_vcpu.mfn_access_reset_req = 0;
+        v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+        /* Pages that are part of page tables must be read only. */
         mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages);
+    }
 
     /* Mask all upcalls... */
     for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
diff --git a/xen/arch/x86/mm/p2m-ma.c b/xen/arch/x86/mm/p2m-ma.c
index d8ad12c..ad37db0 100644
--- a/xen/arch/x86/mm/p2m-ma.c
+++ b/xen/arch/x86/mm/p2m-ma.c
@@ -27,6 +27,8 @@
 #include "mm-locks.h"
 
 /* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
 #undef mfn_to_page
 #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
 
@@ -125,6 +127,34 @@ p2m_mem_access_get_entry(struct p2m_domain *p2m, unsigned long gfn,
     return mfn;
 }
 
+void p2m_mem_access_reset_mfn_entry(void)
+{
+    mfn_t mfn = _mfn(current->arch.pv_vcpu.mfn_access_reset);
+    struct page_info *page;
+    struct domain *d = current->domain;
+
+    if ( unlikely(!mfn_valid(mfn)) )
+        return;
+
+    page = mfn_to_page(mfn);
+    if ( page_get_owner(page) != d )
+        return;
+
+    ASSERT(!paging_locked_by_me(d));
+    paging_lock(d);
+
+    shadow_set_access(page, p2m_get_hostp2m(d)->default_access);
+
+    if ( sh_remove_all_mappings(current, mfn) )
+        flush_tlb_mask(d->domain_dirty_cpumask);
+
+    current->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+    current->arch.pv_vcpu.mfn_access_reset_req = 0;
+
+    paging_unlock(d);
+    domain_unpause(d);
+}
+
 /* Reset the set_entry and get_entry function pointers */
 void p2m_mem_access_reset(struct p2m_domain *p2m)
 {
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index 9aacd8e..5f02948 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1132,8 +1132,36 @@ int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
  * appropriately.  Returns 0 if we page-faulted, 1 for success. */
 {
     int failed;
+    unsigned long mfn_access_reset_saved = INVALID_MFN;
     paging_lock(v->domain);
+
+    /*
+     * If a mem_access listener is present for a PV guest and is listening for
+     * write violations, we want to allow Xen writes to guest memory to go
+     * through. To allow this, we set PTE.RW for the MFN in question and
+     * reset it after the write has gone through. The reset is kicked off
+     * at the end of the guest access functions __copy_to_user_ll() and
+     * __put_user_size() if mfn_access_reset_req is set. Since these functions
+     * are also called by the shadow code when setting PTE.RW or doing
+     * sh_remove_all_mappings(), we temporarily set mfn_access_reset_req to 0
+     * to prevent p2m_mem_access_reset_mfn_entry() from firing.
+     */
+    if ( unlikely(mem_event_check_ring(&v->domain->mem_event->access)) &&
+         is_pv_domain(v->domain) && v->arch.pv_vcpu.mfn_access_reset_req )
+    {
+        mfn_access_reset_saved = v->arch.pv_vcpu.mfn_access_reset;
+        v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+        v->arch.pv_vcpu.mfn_access_reset_req = 0;
+    }
+
     failed = __copy_to_user(p, &new, sizeof(new));
+
+    if ( unlikely(mfn_valid(_mfn(mfn_access_reset_saved))) )
+    {
+        v->arch.pv_vcpu.mfn_access_reset = mfn_access_reset_saved;
+        v->arch.pv_vcpu.mfn_access_reset_req = 1;
+    }
+
     if ( failed != sizeof(new) )
         sh_validate_guest_entry(v, gmfn, p, sizeof(new));
     paging_unlock(v->domain);
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index db30396..bcf8fdf 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -795,7 +795,7 @@ static inline void safe_write_entry(void *dst, void *src)
 
 
 static inline void 
-shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
+shadow_write_entries(struct vcpu *v, void *d, void *s, int entries, mfn_t mfn)
 /* This function does the actual writes to shadow pages.
  * It must not be called directly, since it doesn't do the bookkeeping
  * that shadow_set_l*e() functions do. */
@@ -804,6 +804,26 @@ shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
     shadow_l1e_t *src = s;
     void *map = NULL;
     int i;
+    unsigned long mfn_access_reset_saved = INVALID_MFN;
+
+    /*
+     * If a mem_access listener is present for a PV guest and is listening for
+     * write violations, we want to allow Xen writes to guest memory to go
+     * through. To allow this, we set PTE.RW for the MFN in question and
+     * reset it after the write has gone through. The reset is kicked off
+     * at the end of the guest access functions __copy_to_user_ll() and
+     * __put_user_size() if mfn_access_reset_req is set. Since these functions
+     * are also called by the shadow code when setting PTE.RW or doing
+     * sh_remove_all_mappings(), we temporarily set mfn_access_reset_req to 0
+     * to prevent p2m_mem_access_reset_mfn_entry() from firing.
+     */
+    if ( unlikely(mem_event_check_ring(&v->domain->mem_event->access)) &&
+         v->arch.pv_vcpu.mfn_access_reset_req && is_pv_domain(v->domain) )
+    {
+        mfn_access_reset_saved = v->arch.pv_vcpu.mfn_access_reset;
+        v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+        v->arch.pv_vcpu.mfn_access_reset_req = 0;
+    }
 
     /* Because we mirror access rights at all levels in the shadow, an
      * l2 (or higher) entry with the RW bit cleared will leave us with
@@ -817,6 +837,12 @@ shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
         dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
     }
 
+    if ( unlikely(mfn_valid(_mfn(mfn_access_reset_saved))) &&
+         is_pv_domain(v->domain) )
+    {
+        v->arch.pv_vcpu.mfn_access_reset = mfn_access_reset_saved;
+        v->arch.pv_vcpu.mfn_access_reset_req = 1;
+    }
 
     for ( i = 0; i < entries; i++ )
         safe_write_entry(dst++, src++);
@@ -924,7 +950,7 @@ static int shadow_set_l4e(struct vcpu *v,
     }
 
     /* Write the new entry */
-    shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+    shadow_write_entries(v, sl4e, &new_sl4e, 1, sl4mfn);
     flags |= SHADOW_SET_CHANGED;
 
     if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) 
@@ -969,7 +995,7 @@ static int shadow_set_l3e(struct vcpu *v,
     }
 
     /* Write the new entry */
-    shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
+    shadow_write_entries(v, sl3e, &new_sl3e, 1, sl3mfn);
     flags |= SHADOW_SET_CHANGED;
 
     if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT ) 
@@ -1051,9 +1077,9 @@ static int shadow_set_l2e(struct vcpu *v,
 
     /* Write the new entry */
 #if GUEST_PAGING_LEVELS == 2
-    shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+    shadow_write_entries(v, sl2e, &pair, 2, sl2mfn);
 #else /* normal case */
-    shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+    shadow_write_entries(v, sl2e, &new_sl2e, 1, sl2mfn);
 #endif
     flags |= SHADOW_SET_CHANGED;
 
@@ -1218,7 +1244,7 @@ static int shadow_set_l1e(struct vcpu *v,
     } 
 
     /* Write the new entry */
-    shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
+    shadow_write_entries(v, sl1e, &new_sl1e, 1, sl1mfn);
     flags |= SHADOW_SET_CHANGED;
 
     if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT) 
@@ -3061,23 +3087,24 @@ static int sh_page_fault(struct vcpu *v,
 
             /*
              * Do not police writes to guest memory from the Xen hypervisor.
-             * This keeps PV mem_access on par with HVM. Turn off CR0.WP here to
-             * allow the write to go through if the guest has marked the page as
-             * writable. Turn it back on in the guest access functions
-             * __copy_to_user / __put_user_size() after the write is completed.
+             * This keeps PV mem_access on par with HVM. Pause the guest and
+             * mark the access entry as RW here to allow the write to go through
+             * if the guest has marked the page as writable. Unpause the guest
+             * and set the access value back to the default at the end of
+             * __copy_to_user, __put_user_size() and create_bounce_frame() after
+             * the write is completed. The guest is paused to prevent other
+             * VCPUs from writing to this page during this window.
              */
             if ( violation && access_w &&
                  regs->eip >= XEN_VIRT_START && regs->eip <= XEN_VIRT_END )
             {
-                unsigned long cr0 = read_cr0();
-
                 violation = 0;
-                if ( cr0 & X86_CR0_WP &&
-                     guest_l1e_get_flags(gw.l1e) & _PAGE_RW )
+                if ( guest_l1e_get_flags(gw.l1e) & _PAGE_RW )
                 {
-                    cr0 &= ~X86_CR0_WP;
-                    write_cr0(cr0);
-                    v->arch.pv_vcpu.need_cr0_wp_set = 1;
+                    domain_pause_nosync(d);
+                    v->arch.pv_vcpu.mfn_access_reset = mfn_x(gmfn);
+                    v->arch.pv_vcpu.mfn_access_reset_req = 1;
+                    shadow_set_access(mfn_to_page(gmfn), p2m_access_rw);
                 }
             }
 
diff --git a/xen/arch/x86/usercopy.c b/xen/arch/x86/usercopy.c
index eecf429..a0d11a9 100644
--- a/xen/arch/x86/usercopy.c
+++ b/xen/arch/x86/usercopy.c
@@ -8,6 +8,7 @@
 
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <asm/p2m.h>
 #include <asm/uaccess.h>
 
 unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned n)
@@ -47,16 +48,12 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned n)
 
     /*
      * A mem_access listener was present and Xen tried to write to guest memory.
-     * To allow this write to go through without an event being sent to the
-     * listener or the pagetable entry being modified, we disabled CR0.WP in the
-     * shadow pagefault handler. We are enabling it back here again.
+     * To allow this write to go through, we modified the PTE in the shadow
+     * page fault handler. Here we reset the access permission of the page
+     * that was written to back to its default value.
      */
-    if ( unlikely(current->arch.pv_vcpu.need_cr0_wp_set) )
-    {
-        write_cr0(read_cr0() | X86_CR0_WP);
-        current->arch.pv_vcpu.need_cr0_wp_set = 0;
-    }
-
+    if ( unlikely(current->arch.pv_vcpu.mfn_access_reset_req) )
+        p2m_mem_access_reset_mfn_entry();
     return __n;
 }
 
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index 3994f4d..97ea966 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -86,6 +86,7 @@ void __dummy__(void)
     OFFSET(VCPU_trap_ctxt, struct vcpu, arch.pv_vcpu.trap_ctxt);
     OFFSET(VCPU_kernel_sp, struct vcpu, arch.pv_vcpu.kernel_sp);
     OFFSET(VCPU_kernel_ss, struct vcpu, arch.pv_vcpu.kernel_ss);
+    OFFSET(VCPU_mfn_access_reset_req, struct vcpu, arch.pv_vcpu.mfn_access_reset_req);
     OFFSET(VCPU_guest_context_flags, struct vcpu, arch.vgc_flags);
     OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
     OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index a3ed216..27fc97f 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -441,6 +441,12 @@ UNLIKELY_START(z, create_bounce_frame_bad_bounce_ip)
         jmp   asm_domain_crash_synchronous  /* Does not return */
 __UNLIKELY_END(create_bounce_frame_bad_bounce_ip)
         movq  %rax,UREGS_rip+8(%rsp)
+        cmpb  $1, VCPU_mfn_access_reset_req(%rbx)
+        je    2f
+        ret
+2:      SAVE_ALL
+        call  p2m_mem_access_reset_mfn_entry
+        RESTORE_ALL
         ret
         _ASM_EXTABLE(.Lft2,  dom_crash_sync_extable)
         _ASM_EXTABLE(.Lft3,  dom_crash_sync_extable)
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index cf2ae2a..fa58444 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -385,10 +385,13 @@ struct pv_vcpu
     struct vcpu_time_info pending_system_time;
 
     /*
-     * Flag that tracks if CR0.WP needs to be set after a Xen write to guest
-     * memory when a PV domain has a mem_access listener attached to it.
+     * Track if an MFN's access permission needs to be reset after a Xen write
+     * to guest memory when a PV domain has a mem_access listener attached to
+     * it. The boolean is used to allow for an easy check of this condition in
+     * create_bounce_frame().
      */
-    bool_t need_cr0_wp_set;
+    bool_t mfn_access_reset_req;
+    unsigned long mfn_access_reset;
 };
 
 struct arch_vcpu
diff --git a/xen/include/asm-x86/uaccess.h b/xen/include/asm-x86/uaccess.h
index 947470d..80c7a78 100644
--- a/xen/include/asm-x86/uaccess.h
+++ b/xen/include/asm-x86/uaccess.h
@@ -21,6 +21,8 @@ unsigned long __copy_from_user_ll(void *to, const void *from, unsigned n);
 extern long __get_user_bad(void);
 extern void __put_user_bad(void);
 
+extern void p2m_mem_access_reset_mfn_entry(void);
+
 /**
  * get_user: - Get a simple variable from user space.
  * @x:   Variable to store result.
diff --git a/xen/include/asm-x86/x86_64/uaccess.h b/xen/include/asm-x86/x86_64/uaccess.h
index 6d13ec6..111e4ae 100644
--- a/xen/include/asm-x86/x86_64/uaccess.h
+++ b/xen/include/asm-x86/x86_64/uaccess.h
@@ -67,11 +67,8 @@ do {                                                                  \
        case 8: __put_user_asm(x,ptr,retval,"q","","ir",errret);break;  \
        default: __put_user_bad();                                      \
        }                                                               \
-    if ( unlikely(current->arch.pv_vcpu.need_cr0_wp_set) ) \
-    { \
-        write_cr0(read_cr0() | X86_CR0_WP); \
-        current->arch.pv_vcpu.need_cr0_wp_set = 0; \
-    } \
+    if ( unlikely(current->arch.pv_vcpu.mfn_access_reset_req) ) \
+        p2m_mem_access_reset_mfn_entry(); \
 } while (0)
 
 #define __get_user_size(x,ptr,size,retval,errret)                      \
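
As a sanity check of the ordering, here is a small standalone userspace mock-up
of the pause -> mark-RW -> write -> reset -> unpause handshake. Everything in
it is a stand-in for the real primitives (domain_pause_nosync(),
shadow_set_access() and friends are only simulated), so treat it as an
illustration of the sequence and nothing more:

    /* handshake_mock.c - illustration only, not Xen code. */
    #include <stdbool.h>
    #include <stdio.h>

    #define INVALID_MFN (~0UL)

    /* The two per-vcpu fields the patch adds, plus a fake "paused" state. */
    struct pv_vcpu_mock {
        bool mfn_access_reset_req;
        unsigned long mfn_access_reset;
        bool domain_paused;
    };

    /* Stands in for the sh_page_fault() path: pause, record the MFN, grant RW. */
    static void xen_write_fault(struct pv_vcpu_mock *v, unsigned long mfn)
    {
        v->domain_paused = true;            /* domain_pause_nosync(d)        */
        v->mfn_access_reset = mfn;
        v->mfn_access_reset_req = true;     /* shadow_set_access(..., rw)    */
        printf("mfn %#lx made writable, domain paused\n", mfn);
    }

    /* Stands in for p2m_mem_access_reset_mfn_entry(). */
    static void reset_mfn_entry(struct pv_vcpu_mock *v)
    {
        if ( !v->mfn_access_reset_req )
            return;
        printf("mfn %#lx reset to default access, domain unpaused\n",
               v->mfn_access_reset);
        v->mfn_access_reset = INVALID_MFN;
        v->mfn_access_reset_req = false;
        v->domain_paused = false;           /* domain_unpause(d)             */
    }

    int main(void)
    {
        struct pv_vcpu_mock v = { false, INVALID_MFN, false };

        xen_write_fault(&v, 0x1234);        /* shadow fault on the Xen write */
        /* ... the actual write to guest memory happens here ...             */
        reset_mfn_entry(&v);                /* tail of __copy_to_user_ll() et al. */
        return 0;
    }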




 

