
Re: [Xen-devel] [PATCH RFC v2 1/4] x86/mm: Shadow and p2m changes for PV mem_access



>>>>That's what you need to figure out. The simplistic solution (single
>>>>stepping just the critical instruction(s)) is probably not going to be
>>>>acceptable due to its fragility. I have no good other suggestions, but
>>>>I'm not eager to allow code in that weakens protection.
>>>
>>> From the debugging I have done to get this working, this is what the
>>> flow should be. Xen tries to write to guest page marked read only and
>>> page fault occurs. So __copy_to_user_ll() ->
>>> handle_exception_saved->do_page_fault() and CR0.WP is cleared. Once
>>> the fault is handled __copy_to_user_ll() is retried and it goes
>>> through. At the end of which CR0.WP is turned on. So this is the only
>>> window that pv_vcpu.need_cr0_wp_set should be true. Is there a spot
>>> outside of this window that I check to see if it is set and if it is,
>>> turn it back on again? Would that be a sufficient bound?
>>
>>That's the obvious (direct) path. What you leave aside are any interrupts
>>occurring in between.
>
>True. I was thinking about disabling interrupts in this window but that
>wouldn't account for non-maskable ones. This is going to be a tough nut to
>crack.

I took another stab at solving this issue. This is what the solution looks like:

1. mem_access listener attaches to a PV domain and listens for write
   violations.
2. Xen tries to write to a guest page that is marked not writable.
3. PF occurs.
        - Do not pass the violation on to the mem_access listener.
        - Temporarily change the access permission for the page in question
          to R/W.
        - Allow the fault to be handled, which will end up with a PTE that
          has the RW bit set.
        - Stash the mfn in question in the pv_vcpu structure.
        - Check if this field is set on exiting from the guest access
          functions. If it is, reset the page permission to its default
          value and drop its shadows.
4. I had to take care that the checks in the guest access functions for
   resetting the page permissions do not kick in when the shadow code itself
   is constructing the PTE for the page in question or removing all of its
   mappings. A simplified sketch of this flow is below, ahead of the patch.
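
In pseudo-C, the flow boils down to the sketch below. This is for
illustration only and is not part of the patch; the helper names
relax_for_xen_write() and reset_after_xen_write() are made up, while
mfn_access_reset, shadow_set_access() and p2m_mem_access_reset_entry() are
the fields/functions the POC patch adds or uses:

/*
 * Hypothetical helpers for illustration only; the real changes live in
 * sh_page_fault(), __copy_to_user_ll() and __put_user_size() in the patch.
 */

/* Step 3: Xen's write faults on a non-writable guest page; no event is sent. */
static void relax_for_xen_write(struct vcpu *v, mfn_t gmfn)
{
    /* Stash the mfn so the guest access exit path knows what to undo. */
    v->arch.pv_vcpu.mfn_access_reset = mfn_x(gmfn);
    /* Relax the page to R/W so the retried write builds a PTE with RW set. */
    shadow_set_access(mfn_to_page(gmfn), p2m_access_rw);
}

/* Tail of __copy_to_user_ll() / __put_user_size(): undo the relaxation. */
static void reset_after_xen_write(void)
{
    if ( unlikely(mfn_valid(current->arch.pv_vcpu.mfn_access_reset)) )
        p2m_mem_access_reset_entry(); /* default access back, shadows dropped */
}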

Here is a POC patch that does the above. I have it on top of the patch that
was using CR0.WP, to highlight the difference. I realize some of the other
comments, like using shadow_flags, have not been addressed here; I just want
feedback on whether this is a viable solution before addressing those issues.

Thanks,
Aravindh

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 49d8545..e6fd08e 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -734,6 +734,8 @@ int arch_set_info_guest(
         if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) ||
              (c(ldt_ents) > 8192) )
             return -EINVAL;
+
+        v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
     }
     else if ( is_pvh_vcpu(v) )
     {
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index d4473c1..f60be0c 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -1168,9 +1168,12 @@ int __init construct_dom0(
                COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
     }
 
-    /* Pages that are part of page tables must be read only. */
     if  ( is_pv_domain(d) )
+    {
+        v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+        /* Pages that are part of page tables must be read only. */
         mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages);
+    }
 
     /* Mask all upcalls... */
     for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
diff --git a/xen/arch/x86/mm/p2m-ma.c b/xen/arch/x86/mm/p2m-ma.c
index d8ad12c..aae972a 100644
--- a/xen/arch/x86/mm/p2m-ma.c
+++ b/xen/arch/x86/mm/p2m-ma.c
@@ -27,6 +27,8 @@
 #include "mm-locks.h"
 
 /* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
 #undef mfn_to_page
 #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
 
@@ -125,6 +127,32 @@ p2m_mem_access_get_entry(struct p2m_domain *p2m, unsigned long gfn,
     return mfn;
 }
 
+void p2m_mem_access_reset_entry(void)
+{
+    mfn_t mfn = _mfn(current->arch.pv_vcpu.mfn_access_reset);
+    struct page_info *page;
+    struct domain *d = current->domain;
+
+    if ( !mfn_valid(mfn) )
+        return;
+
+    page = mfn_to_page(mfn);
+    if ( page_get_owner(page) != d )
+        return;
+
+    ASSERT(!paging_locked_by_me(d));
+    paging_lock(d);
+
+    shadow_set_access(page, p2m_get_hostp2m(d)->default_access);
+
+    if ( sh_remove_all_mappings(current, mfn) )
+        flush_tlb_mask(d->domain_dirty_cpumask);
+
+    current->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+
+    paging_unlock(d);
+}
+
 /* Reset the set_entry and get_entry function pointers */
 void p2m_mem_access_reset(struct p2m_domain *p2m)
 {
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index 9aacd8e..76e6fa9 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1132,8 +1132,33 @@ int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
  * appropriately.  Returns 0 if we page-faulted, 1 for success. */
 {
     int failed;
+    unsigned long mfn_access_reset_saved = INVALID_MFN;
     paging_lock(v->domain);
+
+    /*
+     * If a mem_access listener is present for a PV guest and is listening for
+     * write violations, we want to allow Xen writes to guest memory to go
+     * through. To allow this we are setting PTE.RW for the MFN in question and
+     * resetting this after the write has gone through. The resetting is kicked
+     * off at the end of the guest access functions __copy_to_user_ll() and
+     * __put_user_size() if mfn_access_reset is a valid MFN. Since these
+     * functions are also called by the shadow code when setting the PTE.RW or
+     * sh_remove_all_mappings(), we temporarily set mfn_access_reset to an
+     * invalid value to prevent p2m_mem_access_reset_entry() from firing.
+     */
+    if ( unlikely(mem_event_check_ring(&v->domain->mem_event->access)) &&
+         mfn_valid(_mfn(v->arch.pv_vcpu.mfn_access_reset)) &&
+         is_pv_domain(v->domain) )
+    {
+        mfn_access_reset_saved = v->arch.pv_vcpu.mfn_access_reset;
+        v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+    }
+
     failed = __copy_to_user(p, &new, sizeof(new));
+
+    if ( unlikely(mfn_valid(_mfn(mfn_access_reset_saved))) )
+        v->arch.pv_vcpu.mfn_access_reset = mfn_access_reset_saved;
+
     if ( failed != sizeof(new) )
         sh_validate_guest_entry(v, gmfn, p, sizeof(new));
     paging_unlock(v->domain);
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index db30396..f572d23 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -795,7 +795,7 @@ static inline void safe_write_entry(void *dst, void *src)
 
 
 static inline void 
-shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
+shadow_write_entries(struct vcpu *v, void *d, void *s, int entries, mfn_t mfn)
 /* This function does the actual writes to shadow pages.
  * It must not be called directly, since it doesn't do the bookkeeping
  * that shadow_set_l*e() functions do. */
@@ -804,6 +804,26 @@ shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
     shadow_l1e_t *src = s;
     void *map = NULL;
     int i;
+    unsigned long mfn_access_reset_saved = INVALID_MFN;
+
+    /*
+     * If a mem_access listener is present for a PV guest and is listening for
+     * write violations, we want to allow Xen writes to guest memory to go
+     * through. To allow this we are setting PTE.RW for the MFN in question and
+     * resetting this after the write has gone through. The resetting is kicked
+     * off at the end of the guest access functions __copy_to_user_ll() and
+     * __put_user_size() if mfn_access_reset is a valid MFN. Since these
+     * functions are also called by the shadow code when setting the PTE.RW or
+     * sh_remove_all_mappings(), we temporarily set mfn_access_reset to an
+     * invalid value to prevent p2m_mem_access_reset_entry() from firing.
+     */
+    if ( unlikely(mem_event_check_ring(&v->domain->mem_event->access)) &&
+         mfn_valid(_mfn(v->arch.pv_vcpu.mfn_access_reset)) &&
+         is_pv_domain(v->domain) )
+    {
+        mfn_access_reset_saved = v->arch.pv_vcpu.mfn_access_reset;
+        v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+    }
 
     /* Because we mirror access rights at all levels in the shadow, an
      * l2 (or higher) entry with the RW bit cleared will leave us with
@@ -817,6 +837,9 @@ shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
         dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
     }
 
+    if ( unlikely(mfn_valid(_mfn(mfn_access_reset_saved))) &&
+         is_pv_domain(v->domain) )
+        v->arch.pv_vcpu.mfn_access_reset = mfn_access_reset_saved;
 
     for ( i = 0; i < entries; i++ )
         safe_write_entry(dst++, src++);
@@ -924,7 +947,7 @@ static int shadow_set_l4e(struct vcpu *v,
     }
 
     /* Write the new entry */
-    shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+    shadow_write_entries(v, sl4e, &new_sl4e, 1, sl4mfn);
     flags |= SHADOW_SET_CHANGED;
 
     if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) 
@@ -969,7 +992,7 @@ static int shadow_set_l3e(struct vcpu *v,
     }
 
     /* Write the new entry */
-    shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
+    shadow_write_entries(v, sl3e, &new_sl3e, 1, sl3mfn);
     flags |= SHADOW_SET_CHANGED;
 
     if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT ) 
@@ -1051,9 +1074,9 @@ static int shadow_set_l2e(struct vcpu *v,
 
     /* Write the new entry */
 #if GUEST_PAGING_LEVELS == 2
-    shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+    shadow_write_entries(v, sl2e, &pair, 2, sl2mfn);
 #else /* normal case */
-    shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+    shadow_write_entries(v, sl2e, &new_sl2e, 1, sl2mfn);
 #endif
     flags |= SHADOW_SET_CHANGED;
 
@@ -1218,7 +1241,7 @@ static int shadow_set_l1e(struct vcpu *v,
     } 
 
     /* Write the new entry */
-    shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
+    shadow_write_entries(v, sl1e, &new_sl1e, 1, sl1mfn);
     flags |= SHADOW_SET_CHANGED;
 
     if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT) 
@@ -3069,15 +3092,11 @@ static int sh_page_fault(struct vcpu *v,
             if ( violation && access_w &&
                  regs->eip >= XEN_VIRT_START && regs->eip <= XEN_VIRT_END )
             {
-                unsigned long cr0 = read_cr0();
-
                 violation = 0;
-                if ( cr0 & X86_CR0_WP &&
-                     guest_l1e_get_flags(gw.l1e) & _PAGE_RW )
+                if ( guest_l1e_get_flags(gw.l1e) & _PAGE_RW )
                 {
-                    cr0 &= ~X86_CR0_WP;
-                    write_cr0(cr0);
-                    v->arch.pv_vcpu.need_cr0_wp_set = 1;
+                    v->arch.pv_vcpu.mfn_access_reset = mfn_x(gmfn);
+                    shadow_set_access(mfn_to_page(gmfn), p2m_access_rw);
                 }
             }
 
diff --git a/xen/arch/x86/usercopy.c b/xen/arch/x86/usercopy.c
index eecf429..e2d192d 100644
--- a/xen/arch/x86/usercopy.c
+++ b/xen/arch/x86/usercopy.c
@@ -8,6 +8,7 @@
 
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <asm/p2m.h>
 #include <asm/uaccess.h>
 
 unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned n)
@@ -47,16 +48,12 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned n)
 
     /*
      * A mem_access listener was present and Xen tried to write to guest memory.
-     * To allow this write to go through without an event being sent to the
-     * listener or the pagetable entry being modified, we disabled CR0.WP in the
-     * shadow pagefault handler. We are enabling it back here again.
+     * To allow this write to go through we modified the PTE in the shadow page
+     * fault handler. We are resetting the access permission of the page that
+     * was written to its default value here.
      */
-    if ( unlikely(current->arch.pv_vcpu.need_cr0_wp_set) )
-    {
-        write_cr0(read_cr0() | X86_CR0_WP);
-        current->arch.pv_vcpu.need_cr0_wp_set = 0;
-    }
-
+    if ( unlikely(mfn_valid(current->arch.pv_vcpu.mfn_access_reset)) )
+        p2m_mem_access_reset_entry();
     return __n;
 }
 
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index cf2ae2a..4b6b782 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -385,10 +385,10 @@ struct pv_vcpu
     struct vcpu_time_info pending_system_time;
 
     /*
-     * Flag that tracks if CR0.WP needs to be set after a Xen write to guest
-     * memory when a PV domain has a mem_access listener attached to it.
+     * Track if a mfn's access permission needs to be reset after a Xen write to
+     * guest memory when a PV domain has a mem_access listener attached to it.
      */
-    bool_t need_cr0_wp_set;
+    unsigned long mfn_access_reset;
 };
 
 struct arch_vcpu
diff --git a/xen/include/asm-x86/uaccess.h b/xen/include/asm-x86/uaccess.h
index 947470d..333d4b1 100644
--- a/xen/include/asm-x86/uaccess.h
+++ b/xen/include/asm-x86/uaccess.h
@@ -21,6 +21,8 @@ unsigned long __copy_from_user_ll(void *to, const void *from, unsigned n);
 extern long __get_user_bad(void);
 extern void __put_user_bad(void);
 
+extern void p2m_mem_access_reset_entry(void);
+
 /**
  * get_user: - Get a simple variable from user space.
  * @x:   Variable to store result.
diff --git a/xen/include/asm-x86/x86_64/uaccess.h b/xen/include/asm-x86/x86_64/uaccess.h
index 6d13ec6..7cacbcd 100644
--- a/xen/include/asm-x86/x86_64/uaccess.h
+++ b/xen/include/asm-x86/x86_64/uaccess.h
@@ -1,8 +1,6 @@
 #ifndef __X86_64_UACCESS_H
 #define __X86_64_UACCESS_H
 
-#include <xen/sched.h>
-
 #define COMPAT_ARG_XLAT_VIRT_BASE ((void *)ARG_XLAT_START(current))
 #define COMPAT_ARG_XLAT_SIZE      (2*PAGE_SIZE)
 struct vcpu;
@@ -67,11 +65,8 @@ do {                                                                \
        case 8: __put_user_asm(x,ptr,retval,"q","","ir",errret);break;  \
        default: __put_user_bad();                                      \
        }                                                               \
-    if ( unlikely(current->arch.pv_vcpu.need_cr0_wp_set) ) \
-    { \
-        write_cr0(read_cr0() | X86_CR0_WP); \
-        current->arch.pv_vcpu.need_cr0_wp_set = 0; \
-    } \
+    if ( unlikely(mfn_valid(current->arch.pv_vcpu.mfn_access_reset)) ) \
+        p2m_mem_access_reset_entry(); \
 } while (0)
 
 #define __get_user_size(x,ptr,size,retval,errret)                      \

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

