[Xen-devel] [PATCH v3 3/6] x86/pagewalk: Re-implement the pagetable walker



The existing pagetable walker has complicated return semantics, which squeeze
multiple pieces of information into a single integer.  This would be fine if
the information didn't overlap, but it does.

Specifically, _PAGE_INVALID_BITS for 3-level guests alias _PAGE_PAGED and
_PAGE_SHARED.  A guest which constructs a PTE with bits 52 or 53 set (the
start of the upper software-available range) will create a virtual address
which, when walked by Xen, tricks Xen into believing the frame is paged or
shared.  This behaviour was introduced by XSA-173 (c/s 8b17648).

It is also complicated to turn rc back into a normal pagefault error code.
Instead, change the calling semantics to return a boolean indicating success,
and have the function accumulate a real pagefault error code as it goes
(including synthetic error codes, which do not alias hardware ones).  This
requires an equivalent adjustment to map_domain_gfn().
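
To illustrate (a condensed sketch of the hap_p2m_ga_to_gfn() hunk below, not a
complete caller), the new convention looks roughly like:

    walk_t gw;
    bool walk_ok = guest_walk_tables(v, p2m, va, &gw, *pfec, top_mfn, top_map);

    if ( walk_ok )
        return gfn_x(guest_walk_to_gfn(&gw));

    *pfec = gw.pfec;   /* Accumulated architectural/synthetic error code. */
    return gfn_x(INVALID_GFN);

rather than decoding a set of missing _PAGE_* flags from the return value.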

Issues fixed:
 * 2-level PSE36 superpages now return the correct translation.
 * 2-level L2 superpages without CR0.PSE now return the correct translation.
 * SMEP now inhibits supervisor instruction fetches from user mappings even
   if NX isn't active.
 * Supervisor writes without CR0.WP now set the leaf dirty bit.
 * L4e._PAGE_GLOBAL is strictly reserved on AMD.
 * 3-level l3 entries have all reserved bits checked.
 * 3-level entries can no longer alias Xen's idea of paged or shared.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Reviewed-by: Tim Deegan <tim@xxxxxxx>
Reviewed-by: George Dunlap <george.dunlap@xxxxxxxxxx>
Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx>
---
 xen/arch/x86/mm/guest_walk.c      | 449 ++++++++++++++++++++------------------
 xen/arch/x86/mm/hap/guest_walk.c  |  21 +-
 xen/arch/x86/mm/hap/nested_ept.c  |   2 +-
 xen/arch/x86/mm/p2m.c             |  17 +-
 xen/arch/x86/mm/shadow/multi.c    |  27 +--
 xen/include/asm-x86/guest_pt.h    |  62 ++++--
 xen/include/asm-x86/p2m.h         |   2 +-
 xen/include/asm-x86/page.h        |   3 -
 xen/include/asm-x86/processor.h   |   2 +
 xen/include/asm-x86/x86_64/page.h |   6 -
 10 files changed, 303 insertions(+), 288 deletions(-)
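
[ For illustration, a rough sketch of the access-right accumulation scheme the
  rewritten walker uses: User/RW must be granted at every level, while NX at
  any level denies execution, so the walker ANDs the former and ORs the latter
  across the entries of the translation: ]

    uint32_t ar, ar_and = ~0u, ar_or = 0;

    /* For each present, non-reserved entry visited by the walk: */
    ar_and &= gflags;   /* _PAGE_USER/_PAGE_RW must be set at every level. */
    ar_or  |= gflags;   /* _PAGE_NX_BIT at any level denies execution. */

    /* Effective rights, compared against the requested access at the end. */
    ar = (ar_and & (_PAGE_USER | _PAGE_RW)) | (ar_or & _PAGE_NX_BIT);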

diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
index c526363..e34a5ec 100644
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -32,44 +32,6 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #include <asm/page.h>
 #include <asm/guest_pt.h>
 
-extern const uint32_t gw_page_flags[];
-#if GUEST_PAGING_LEVELS == CONFIG_PAGING_LEVELS
-const uint32_t gw_page_flags[] = {
-    /* I/F -  Usr Wr */
-    /* 0   0   0   0 */ _PAGE_PRESENT,
-    /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-    /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-    /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-    /* 0   1   0   0 */ _PAGE_PRESENT,
-    /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-    /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-    /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-    /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
-    /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-    /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-    /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-    /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
-    /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-    /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-    /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-};
-#endif
-
-/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
-static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
-{
-    /* Don't demand not-NX if the CPU wouldn't enforce it. */
-    if ( !guest_nx_enabled(v) )
-        pfec &= ~PFEC_insn_fetch;
-
-    /* Don't demand R/W if the CPU wouldn't enforce it. */
-    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
-         && !(pfec & PFEC_user_mode) )
-        pfec &= ~PFEC_write_access;
-
-    return gw_page_flags[(pfec & 0x1f) >> 1] | _PAGE_INVALID_BITS;
-}
-
 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
  * Returns non-zero if it actually writes to guest memory. */
 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
@@ -90,62 +52,33 @@ static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
     return 0;
 }
 
-#if GUEST_PAGING_LEVELS >= 4
-static bool_t pkey_fault(struct vcpu *vcpu, uint32_t pfec,
-        uint32_t pte_flags, uint32_t pte_pkey)
-{
-    uint32_t pkru;
-
-    /* When page isn't present,  PKEY isn't checked. */
-    if ( !(pfec & PFEC_page_present) || is_pv_vcpu(vcpu) )
-        return 0;
-
-    /*
-     * PKU:  additional mechanism by which the paging controls
-     * access to user-mode addresses based on the value in the
-     * PKRU register. A fault is considered as a PKU violation if all
-     * of the following conditions are true:
-     * 1.CR4_PKE=1.
-     * 2.EFER_LMA=1.
-     * 3.Page is present with no reserved bit violations.
-     * 4.The access is not an instruction fetch.
-     * 5.The access is to a user page.
-     * 6.PKRU.AD=1 or
-     *      the access is a data write and PKRU.WD=1 and
-     *          either CR0.WP=1 or it is a user access.
-     */
-    if ( !hvm_pku_enabled(vcpu) ||
-         !hvm_long_mode_enabled(vcpu) ||
-         (pfec & PFEC_reserved_bit) ||
-         (pfec & PFEC_insn_fetch) ||
-         !(pte_flags & _PAGE_USER) )
-        return 0;
-
-    pkru = read_pkru();
-    if ( unlikely(pkru) )
-    {
-        bool_t pkru_ad = read_pkru_ad(pkru, pte_pkey);
-        bool_t pkru_wd = read_pkru_wd(pkru, pte_pkey);
-
-        /* Condition 6 */
-        if ( pkru_ad ||
-             (pkru_wd && (pfec & PFEC_write_access) &&
-              (hvm_wp_enabled(vcpu) || (pfec & PFEC_user_mode))) )
-            return 1;
-    }
-
-    return 0;
-}
-#endif
-
-/* Walk the guest pagetables, after the manner of a hardware walker. */
-/* Because the walk is essentially random, it can cause a deadlock 
- * warning in the p2m locking code. Highly unlikely this is an actual
- * deadlock, because who would walk page table in the opposite order? */
-uint32_t
+/*
+ * Walk the guest pagetables, after the manner of a hardware walker.
+ *
+ * This is a condensing of the 'Paging' chapters from Intel and AMD software
+ * manuals.  Please refer closely to them.
+ *
+ * A pagetable walk consists of two parts:
+ *   1) to find whether a translation exists, and
+ *   2) if a translation does exist, to check whether the translation's access
+ *      rights permit the access.
+ *
+ * A translation is found by following the pagetable structure (starting at
+ * %cr3) to a leaf entry (an L1 PTE, or a higher level entry with PSE set)
+ * which identifies the physical destination of the access.
+ *
+ * A translation from one level to the next exists if the PTE is both present
+ * and has no reserved bits set.  If the pagewalk encounters a situation
+ * where a translation does not exist, the walk stops at that point.
+ *
+ * The access rights (NX, User, RW bits) are collected as the walk progresses.
+ * If a translation exists, the accumulated access rights are compared to the
+ * requested walk, to see whether the access is permitted.
+ */
+bool
 guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                   unsigned long va, walk_t *gw, 
-                  uint32_t pfec, mfn_t top_mfn, void *top_map)
+                  uint32_t walk, mfn_t top_mfn, void *top_map)
 {
     struct domain *d = v->domain;
     p2m_type_t p2mt;
@@ -155,64 +88,44 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     guest_l3e_t *l3p = NULL;
     guest_l4e_t *l4p;
 #endif
-    unsigned int pkey;
-    uint32_t gflags, mflags, iflags, rc = 0;
-    bool_t smep = 0, smap = 0;
+    uint32_t gflags, rc;
     bool_t pse1G = 0, pse2M = 0;
     p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE;
 
-    /* Only implicit supervisor data accesses exist. */
-    ASSERT(!(pfec & PFEC_implicit) ||
-           !(pfec & (PFEC_insn_fetch|PFEC_user_mode)));
+#define AR_ACCUM_AND (_PAGE_USER | _PAGE_RW)
+#define AR_ACCUM_OR  (_PAGE_NX_BIT)
+    /* Start with all AND bits set, all OR bits clear. */
+    uint32_t ar, ar_and = ~0u, ar_or = 0;
 
-    perfc_incr(guest_walk);
-    memset(gw, 0, sizeof(*gw));
-    gw->va = va;
-
-    /* Mandatory bits that must be set in every entry.  We invert NX and
-     * the invalid bits, to calculate as if there were an "X" bit that
-     * allowed access.  We will accumulate, in rc, the set of flags that
-     * are missing/unwanted. */
-    mflags = mandatory_flags(v, pfec);
-    iflags = (_PAGE_NX_BIT | _PAGE_INVALID_BITS);
+    bool walk_ok = false;
 
-    if ( is_hvm_domain(d) && !(pfec & PFEC_user_mode) )
-    {
-        const struct cpu_user_regs *regs = guest_cpu_user_regs();
+    /*
+     * TODO - We should ASSERT() that only the following bits are set as
+     * inputs to a guest walk, but a whole load of code currently passes in
+     * other PFEC_ constants.
+     */
+    walk &= (PFEC_implicit | PFEC_insn_fetch | PFEC_user_mode | PFEC_write_access);
 
-        /* SMEP: kernel-mode instruction fetches from user-mode mappings
-         * should fault.  Unlike NX or invalid bits, we're looking for _all_
-         * entries in the walk to have _PAGE_USER set, so we need to do the
-         * whole walk as if it were a user-mode one and then invert the answer. */
-        smep =  hvm_smep_enabled(v) && (pfec & PFEC_insn_fetch);
+    /* Only implicit supervisor data accesses exist. */
+    ASSERT(!(walk & PFEC_implicit) ||
+           !(walk & (PFEC_insn_fetch | PFEC_user_mode)));
 
-        switch ( v->arch.smap_check_policy )
-        {
-        case SMAP_CHECK_HONOR_CPL_AC:
-            /*
-             * SMAP: kernel-mode data accesses from user-mode mappings
-             * should fault.
-             * A fault is considered as a SMAP violation if the following
-             * conditions come true:
-             *   - X86_CR4_SMAP is set in CR4
-             *   - A user page is accessed
-             *   - CPL = 3 or X86_EFLAGS_AC is clear
-             *   - Page fault in kernel mode
-             */
-            smap = hvm_smap_enabled(v) &&
-                   ((hvm_get_cpl(v) == 3) || !(regs->eflags & X86_EFLAGS_AC));
-            break;
-        case SMAP_CHECK_ENABLED:
-            smap = hvm_smap_enabled(v);
-            break;
-        default:
-            ASSERT(v->arch.smap_check_policy == SMAP_CHECK_DISABLED);
-            break;
-        }
-    }
+    /*
+     * PFEC_insn_fetch is only used as an input to pagetable walking if NX or
+     * SMEP are enabled.  Otherwise, instruction fetches are indistinguishable
+     * from data reads.
+     *
+     * This property can be demonstrated on real hardware by having NX and
+     * SMEP inactive, but SMAP active, and observing that EFLAGS.AC determines
+     * whether a pagefault occurs for supervisor execution on user mappings.
+     */
+    if ( !(guest_nx_enabled(v) || guest_smep_enabled(v)) )
+        walk &= ~PFEC_insn_fetch;
 
-    if ( smep || smap )
-        mflags |= _PAGE_USER;
+    perfc_incr(guest_walk);
+    memset(gw, 0, sizeof(*gw));
+    gw->va = va;
+    gw->pfec = walk & (PFEC_insn_fetch | PFEC_user_mode | PFEC_write_access);
 
 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
@@ -221,17 +134,20 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     gw->l4mfn = top_mfn;
     l4p = (guest_l4e_t *) top_map;
     gw->l4e = l4p[guest_l4_table_offset(va)];
-    gflags = guest_l4e_get_flags(gw->l4e) ^ iflags;
-    if ( !(gflags & _PAGE_PRESENT) ) {
-        rc |= _PAGE_PRESENT;
+    gflags = guest_l4e_get_flags(gw->l4e);
+    if ( !(gflags & _PAGE_PRESENT) )
         goto out;
-    }
-    if ( gflags & _PAGE_PSE )
+
+    /* Check for reserved bits. */
+    if ( guest_l4e_rsvd_bits(v, gw->l4e) )
     {
-        rc |= _PAGE_PSE | _PAGE_INVALID_BIT;
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
         goto out;
     }
-    rc |= ((gflags & mflags) ^ mflags);
+
+    /* Accumulate l4e access rights. */
+    ar_and &= gflags;
+    ar_or  |= gflags;
 
     /* Map the l3 table */
     l3p = map_domain_gfn(p2m, 
@@ -241,17 +157,28 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                          qt,
                          &rc); 
     if(l3p == NULL)
+    {
+        gw->pfec |= rc & PFEC_synth_mask;
         goto out;
+    }
+
     /* Get the l3e and check its flags*/
     gw->l3e = l3p[guest_l3_table_offset(va)];
-    pkey = guest_l3e_get_pkey(gw->l3e);
-    gflags = guest_l3e_get_flags(gw->l3e) ^ iflags;
-    if ( !(gflags & _PAGE_PRESENT) ) {
-        rc |= _PAGE_PRESENT;
+    gflags = guest_l3e_get_flags(gw->l3e);
+    if ( !(gflags & _PAGE_PRESENT) )
+        goto out;
+
+    /* Check for reserved bits, including possibly _PAGE_PSE. */
+    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
+    {
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
         goto out;
     }
-    rc |= ((gflags & mflags) ^ mflags);
     
+    /* Accumulate l3e access rights. */
+    ar_and &= gflags;
+    ar_or  |= gflags;
+
     pse1G = !!(gflags & _PAGE_PSE);
 
     if ( pse1G )
@@ -272,26 +199,25 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
             /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
             flags &= ~_PAGE_PAT;
 
-        if ( !guest_can_use_l3_superpages(d) )
-            rc |= _PAGE_PSE | _PAGE_INVALID_BIT;
-        if ( gfn_x(start) & GUEST_L3_GFN_MASK & ~0x1 )
-            rc |= _PAGE_INVALID_BITS;
-
         /* Increment the pfn by the right number of 4k pages. */
         start = _gfn((gfn_x(start) & ~GUEST_L3_GFN_MASK) +
                      ((va >> PAGE_SHIFT) & GUEST_L3_GFN_MASK));
         gw->l1e = guest_l1e_from_gfn(start, flags);
         gw->l2mfn = gw->l1mfn = INVALID_MFN;
-        goto set_ad;
+        goto leaf;
     }
 
 #else /* PAE only... */
 
     /* Get the l3e and check its flag */
     gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
-    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
+    gflags = guest_l3e_get_flags(gw->l3e);
+    if ( !(gflags & _PAGE_PRESENT) )
+        goto out;
+
+    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
     {
-        rc |= _PAGE_PRESENT;
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
         goto out;
     }
 
@@ -305,7 +231,11 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                          qt,
                          &rc); 
     if(l2p == NULL)
+    {
+        gw->pfec |= rc & PFEC_synth_mask;
         goto out;
+    }
+
     /* Get the l2e */
     gw->l2e = l2p[guest_l2_table_offset(va)];
 
@@ -318,15 +248,32 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
 
 #endif /* All levels... */
 
-    pkey = guest_l2e_get_pkey(gw->l2e);
-    gflags = guest_l2e_get_flags(gw->l2e) ^ iflags;
-    if ( !(gflags & _PAGE_PRESENT) ) {
-        rc |= _PAGE_PRESENT;
+    /* Check the l2e flags. */
+    gflags = guest_l2e_get_flags(gw->l2e);
+    if ( !(gflags & _PAGE_PRESENT) )
+        goto out;
+
+    /*
+     * In 2-level paging without CR0.PSE, there are no reserved bits, and the
+     * PAT/PSE bit is ignored.
+     */
+    if ( GUEST_PAGING_LEVELS == 2 && !guest_can_use_l2_superpages(v) )
+    {
+        gw->l2e.l2 &= ~_PAGE_PSE;
+        gflags &= ~_PAGE_PSE;
+    }
+    /* else check for reserved bits, including possibly _PAGE_PSE. */
+    else if ( guest_l2e_rsvd_bits(v, gw->l2e) )
+    {
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
         goto out;
     }
-    rc |= ((gflags & mflags) ^ mflags);
 
-    pse2M = (gflags & _PAGE_PSE) && guest_can_use_l2_superpages(v);
+    /* Accumulate l2e access rights. */
+    ar_and &= gflags;
+    ar_or  |= gflags;
+
+    pse2M = !!(gflags & _PAGE_PSE);
 
     if ( pse2M )
     {
@@ -334,7 +281,11 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
          * no guest l1e.  We make one up so that the propagation code
          * can generate a shadow l1 table.  Start with the gfn of the 
          * first 4k-page of the superpage. */
+#if GUEST_PAGING_LEVELS == 2
+        gfn_t start = _gfn(unfold_pse36(gw->l2e.l2) >> PAGE_SHIFT);
+#else
         gfn_t start = guest_l2e_get_gfn(gw->l2e);
+#endif
         /* Grant full access in the l1e, since all the guest entry's 
          * access controls are enforced in the shadow l2e. */
         int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
@@ -348,70 +299,137 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
             /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
             flags &= ~_PAGE_PAT;
 
-        if ( gfn_x(start) & GUEST_L2_GFN_MASK & ~0x1 )
-            rc |= _PAGE_INVALID_BITS;
-
-        /* Increment the pfn by the right number of 4k pages.  
-         * Mask out PAT and invalid bits. */
+        /* Increment the pfn by the right number of 4k pages. */
         start = _gfn((gfn_x(start) & ~GUEST_L2_GFN_MASK) +
                      guest_l1_table_offset(va));
+#if GUEST_PAGING_LEVELS == 2
+         /* Wider than 32 bits if PSE36 superpage. */
+        gw->el1e = (gfn_x(start) << PAGE_SHIFT) | flags;
+#else
         gw->l1e = guest_l1e_from_gfn(start, flags);
+#endif
         gw->l1mfn = INVALID_MFN;
-    } 
-    else 
+        goto leaf;
+    }
+
+    /* Map the l1 table */
+    l1p = map_domain_gfn(p2m,
+                         guest_l2e_get_gfn(gw->l2e),
+                         &gw->l1mfn,
+                         &p2mt,
+                         qt,
+                         &rc);
+    if ( l1p == NULL )
     {
-        /* Not a superpage: carry on and find the l1e. */
-        l1p = map_domain_gfn(p2m, 
-                             guest_l2e_get_gfn(gw->l2e), 
-                             &gw->l1mfn,
-                             &p2mt,
-                             qt,
-                             &rc);
-        if(l1p == NULL)
+        gw->pfec |= rc & PFEC_synth_mask;
+        goto out;
+    }
+    gw->l1e = l1p[guest_l1_table_offset(va)];
+    gflags = guest_l1e_get_flags(gw->l1e);
+    if ( !(gflags & _PAGE_PRESENT) )
+        goto out;
+
+    /* Check for reserved bits. */
+    if ( guest_l1e_rsvd_bits(v, gw->l1e) )
+    {
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
+        goto out;
+    }
+
+    /* Accumulate l1e access rights. */
+    ar_and &= gflags;
+    ar_or  |= gflags;
+
+ leaf:
+    gw->pfec |= PFEC_page_present;
+
+    /*
+     * The pagetable walk has returned a successful translation.  Now check
+     * access rights to see whether the access should succeed.
+     */
+    ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);
+
+    if ( (walk & PFEC_insn_fetch) && (ar & _PAGE_NX_BIT) )
+        /* Requested an instruction fetch and found NX? Fail. */
+        goto out;
+
+    if ( walk & PFEC_user_mode ) /* Requested a user access. */
+    {
+        if ( !(ar & _PAGE_USER) )
+            /* Got a supervisor walk?  Unconditional fail. */
             goto out;
-        gw->l1e = l1p[guest_l1_table_offset(va)];
-        pkey = guest_l1e_get_pkey(gw->l1e);
-        gflags = guest_l1e_get_flags(gw->l1e) ^ iflags;
-        if ( !(gflags & _PAGE_PRESENT) ) {
-            rc |= _PAGE_PRESENT;
+
+        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) )
+            /* Requested a write and only got a read? Fail. */
             goto out;
+    }
+    else /* Requested a supervisor access. */
+    {
+        if ( ar & _PAGE_USER ) /* Got a user walk. */
+        {
+            if ( (walk & PFEC_insn_fetch) && guest_smep_enabled(v) )
+                /* User insn fetch and smep? Fail. */
+                goto out;
+
+            if ( !(walk & PFEC_insn_fetch) && guest_smap_enabled(v) &&
+                 ((walk & PFEC_implicit) ||
+                  !(guest_cpu_user_regs()->eflags & X86_EFLAGS_AC)) )
+                /* User data access and smap? Fail. */
+                goto out;
         }
-        rc |= ((gflags & mflags) ^ mflags);
+
+        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) &&
+             guest_wp_enabled(v) )
+            /* Requested a write, got a read, and CR0.WP is set? Fail. */
+            goto out;
     }
 
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-set_ad:
-    if ( pkey_fault(v, pfec, gflags, pkey) )
-        rc |= _PAGE_PKEY_BITS;
+    /*
+     * If all access checks are thus far ok, check Protection Key for 64bit
+     * user data accesses.
+     *
+     * N.B. In the case that the walk ended with a superpage, the fabricated
+     * gw->l1e contains the appropriate leaf pkey.
+     */
+    if ( (walk & PFEC_user_mode) && !(walk & PFEC_insn_fetch) &&
+         guest_pku_enabled(v) )
+    {
+        unsigned int pkey = guest_l1e_get_pkey(gw->l1e);
+        unsigned int pkru = read_pkru();
+
+        if ( read_pkru_ad(pkru, pkey) ||
+             ((walk & PFEC_write_access) && read_pkru_wd(pkru, pkey)) )
+        {
+            gw->pfec |= PFEC_prot_key;
+            goto out;
+        }
+    }
 #endif
-    /* Now re-invert the user-mode requirement for SMEP and SMAP */
-    if ( smep || smap )
-        rc ^= _PAGE_USER;
+
+    walk_ok = true;
 
     /* Go back and set accessed and dirty bits only if the walk was a
      * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
      * get set whenever a lower-level PT is used, at least some hardware
      * walkers behave this way. */
-    if ( rc == 0 ) 
-    {
 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
-        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
-            paging_mark_dirty(d, gw->l4mfn);
-        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e,
-                         (pse1G && (pfec & PFEC_write_access))) )
-            paging_mark_dirty(d, gw->l3mfn);
+    if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
+        paging_mark_dirty(d, gw->l4mfn);
+    if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e,
+                     (pse1G && (walk & PFEC_write_access))) )
+        paging_mark_dirty(d, gw->l3mfn);
 #endif
-        if ( !pse1G ) 
+    if ( !pse1G )
+    {
+        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
+                         (pse2M && (walk & PFEC_write_access))) )
+            paging_mark_dirty(d, gw->l2mfn);
+        if ( !pse2M )
         {
-            if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
-                             (pse2M && (pfec & PFEC_write_access))) )
-                paging_mark_dirty(d, gw->l2mfn);
-            if ( !pse2M ) 
-            {
-                if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
-                                 (pfec & PFEC_write_access)) )
-                    paging_mark_dirty(d, gw->l1mfn);
-            }
+            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
+                             (walk & PFEC_write_access)) )
+                paging_mark_dirty(d, gw->l1mfn);
         }
     }
 
@@ -436,10 +454,5 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
         put_page(mfn_to_page(mfn_x(gw->l1mfn)));
     }
 
-    /* If this guest has a restricted physical address space then the
-     * target GFN must fit within it. */
-    if ( !(rc & _PAGE_PRESENT) && !gfn_valid(d, guest_walk_to_gfn(gw)) )
-        rc |= _PAGE_INVALID_BITS;
-
-    return rc;
+    return walk_ok;
 }
diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c
index e202c9a..5a4896b 100644
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -50,7 +50,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
     struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
     paddr_t ga, uint32_t *pfec, unsigned int *page_order)
 {
-    uint32_t missing;
+    bool walk_ok;
     mfn_t top_mfn;
     void *top_map;
     p2m_type_t p2mt;
@@ -91,12 +91,12 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
 #if GUEST_PAGING_LEVELS == 3
     top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
-    missing = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map);
+    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map);
     unmap_domain_page(top_map);
     put_page(top_page);
 
     /* Interpret the answer */
-    if ( missing == 0 )
+    if ( walk_ok )
     {
         gfn_t gfn = guest_walk_to_gfn(&gw);
         struct page_info *page;
@@ -123,20 +123,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
         return gfn_x(gfn);
     }
 
-    if ( missing & _PAGE_PRESENT )
-        *pfec &= ~PFEC_page_present;
-
-    if ( missing & _PAGE_INVALID_BITS ) 
-        *pfec |= PFEC_reserved_bit;
-
-    if ( missing & _PAGE_PKEY_BITS )
-        *pfec |= PFEC_prot_key;
-
-    if ( missing & _PAGE_PAGED )
-        *pfec = PFEC_page_paged;
-
-    if ( missing & _PAGE_SHARED )
-        *pfec = PFEC_page_shared;
+    *pfec = gw.pfec;
 
  out_tweak_pfec:
     /*
diff --git a/xen/arch/x86/mm/hap/nested_ept.c b/xen/arch/x86/mm/hap/nested_ept.c
index 02b27b1..14b1bb0 100644
--- a/xen/arch/x86/mm/hap/nested_ept.c
+++ b/xen/arch/x86/mm/hap/nested_ept.c
@@ -208,7 +208,7 @@ nept_walk_tables(struct vcpu *v, unsigned long l2ga, ept_walk_t *gw)
     goto out;
 
 map_err:
-    if ( rc == _PAGE_PAGED )
+    if ( rc == PFEC_page_paged )
     {
         ret = EPT_TRANSLATE_RETRY;
         goto out;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index a5651a3..d38004c 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1774,17 +1774,18 @@ unsigned long paging_gva_to_gfn(struct vcpu *v,
 }
 
 /*
- * If the map is non-NULL, we leave this function having
- * acquired an extra ref on mfn_to_page(*mfn).
+ * If the map is non-NULL, we leave this function having acquired an extra ref
+ * on mfn_to_page(*mfn).  In all cases, *pfec contains appropriate
+ * synthetic/structure PFEC_* bits.
  */
 void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
-                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *rc)
+                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *pfec)
 {
     struct page_info *page;
 
     if ( !gfn_valid(p2m->domain, gfn) )
     {
-        *rc = _PAGE_INVALID_BIT;
+        *pfec = PFEC_reserved_bit | PFEC_page_present;
         return NULL;
     }
 
@@ -1796,21 +1797,23 @@ void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
         if ( page )
             put_page(page);
         p2m_mem_paging_populate(p2m->domain, gfn_x(gfn));
-        *rc = _PAGE_PAGED;
+        *pfec = PFEC_page_paged;
         return NULL;
     }
     if ( p2m_is_shared(*p2mt) )
     {
         if ( page )
             put_page(page);
-        *rc = _PAGE_SHARED;
+        *pfec = PFEC_page_shared;
         return NULL;
     }
     if ( !page )
     {
-        *rc |= _PAGE_PRESENT;
+        *pfec = 0;
         return NULL;
     }
+
+    *pfec = PFEC_page_present;
     *mfn = page_to_mfn(page);
     ASSERT(mfn_valid(*mfn));
 
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 95e2f85..ee110d0 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -171,7 +171,7 @@ delete_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
 /**************************************************************************/
 /* Functions for walking the guest page tables */
 
-static inline uint32_t
+static inline bool
 sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                      uint32_t pfec)
 {
@@ -2858,6 +2858,7 @@ static int sh_page_fault(struct vcpu *v,
     int r;
     p2m_type_t p2mt;
     uint32_t rc, error_code;
+    bool walk_ok;
     int version;
     const struct npfec access = {
          .read_access = 1,
@@ -3075,21 +3076,20 @@ static int sh_page_fault(struct vcpu *v,
      * shadow page table. */
     version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
     rmb();
-    rc = sh_walk_guest_tables(v, va, &gw, error_code);
+    walk_ok = sh_walk_guest_tables(v, va, &gw, error_code);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
     regs->error_code &= ~PFEC_page_present;
-    if ( !(rc & _PAGE_PRESENT) )
+    if ( gw.pfec & PFEC_page_present )
         regs->error_code |= PFEC_page_present;
 #endif
 
-    if ( rc != 0 )
+    if ( !walk_ok )
     {
         perfc_incr(shadow_fault_bail_real_fault);
         SHADOW_PRINTK("not a shadow fault\n");
         reset_early_unshadow(v);
-        if ( (rc & _PAGE_INVALID_BITS) )
-            regs->error_code |= PFEC_reserved_bit;
+        regs->error_code = gw.pfec & PFEC_arch_mask;
         goto propagate;
     }
 
@@ -3723,7 +3723,7 @@ sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
 {
     walk_t gw;
     gfn_t gfn;
-    uint32_t missing;
+    bool walk_ok;
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
     /* Check the vTLB cache first */
@@ -3732,18 +3732,9 @@ sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
         return vtlb_gfn;
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 
-    if ( (missing = sh_walk_guest_tables(v, va, &gw, *pfec)) != 0 )
+    if ( !(walk_ok = sh_walk_guest_tables(v, va, &gw, *pfec)) )
     {
-        if ( (missing & _PAGE_PRESENT) )
-            *pfec &= ~PFEC_page_present;
-        if ( missing & _PAGE_INVALID_BITS )
-            *pfec |= PFEC_reserved_bit;
-        /*
-         * SDM Intel 64 Volume 3, Chapter Paging, PAGE-FAULT EXCEPTIONS:
-         * The PFEC_insn_fetch flag is set only when NX or SMEP are enabled.
-         */
-        if ( is_hvm_vcpu(v) && !hvm_nx_enabled(v) && !hvm_smep_enabled(v) )
-            *pfec &= ~PFEC_insn_fetch;
+        *pfec = gw.pfec;
         return gfn_x(INVALID_GFN);
     }
     gfn = guest_walk_to_gfn(&gw);
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
index 387b1ed..72126d5 100644
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -259,19 +259,26 @@ static inline bool guest_nx_enabled(const struct vcpu *v)
     return is_pv_vcpu(v) ? cpu_has_nx : hvm_nx_enabled(v);
 }
 
+static inline bool guest_wp_enabled(const struct vcpu *v)
+{
+    /* PV guests can't control CR0.WP, and it is unconditionally set by Xen. */
+    return is_pv_vcpu(v) || hvm_wp_enabled(v);
+}
 
-/*
- * Some bits are invalid in any pagetable entry.
- * Normal flags values get represented in 24-bit values (see
- * get_pte_flags() and put_pte_flags()), so set bit 24 in
- * addition to be able to flag out of range frame numbers.
- */
-#if GUEST_PAGING_LEVELS == 3
-#define _PAGE_INVALID_BITS \
-    (_PAGE_INVALID_BIT | get_pte_flags(((1ull << 63) - 1) & ~(PAGE_SIZE - 1)))
-#else /* 2-level and 4-level */
-#define _PAGE_INVALID_BITS _PAGE_INVALID_BIT
-#endif
+static inline bool guest_smep_enabled(const struct vcpu *v)
+{
+    return !is_pv_vcpu(v) && hvm_smep_enabled(v);
+}
+
+static inline bool guest_smap_enabled(const struct vcpu *v)
+{
+    return !is_pv_vcpu(v) && hvm_smap_enabled(v);
+}
+
+static inline bool guest_pku_enabled(const struct vcpu *v)
+{
+    return !is_pv_vcpu(v) && hvm_pku_enabled(v);
+}
 
 /* Helpers for identifying whether guest entries have reserved bits set. */
 
@@ -336,13 +343,19 @@ struct guest_pagetable_walk
     guest_l3e_t l3e;            /* Guest's level 3 entry */
 #endif
     guest_l2e_t l2e;            /* Guest's level 2 entry */
-    guest_l1e_t l1e;            /* Guest's level 1 entry (or fabrication) */
+    union
+    {
+        guest_l1e_t l1e;        /* Guest's level 1 entry (or fabrication). */
+        uint64_t   el1e;        /* L2 PSE36 superpages wider than 32 bits. */
+    };
 #if GUEST_PAGING_LEVELS >= 4
     mfn_t l4mfn;                /* MFN that the level 4 entry was in */
     mfn_t l3mfn;                /* MFN that the level 3 entry was in */
 #endif
     mfn_t l2mfn;                /* MFN that the level 2 entry was in */
     mfn_t l1mfn;                /* MFN that the level 1 entry was in */
+
+    uint32_t pfec;              /* Accumulated PFEC_* error code from walk. */
 };
 
 /* Given a walk_t, translate the gw->va into the guest's notion of the
@@ -351,7 +364,9 @@ static inline gfn_t guest_walk_to_gfn(const walk_t *gw)
 {
     if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
         return INVALID_GFN;
-    return guest_l1e_get_gfn(gw->l1e);
+    return (GUEST_PAGING_LEVELS == 2
+            ? _gfn(gw->el1e >> PAGE_SHIFT)
+            : guest_l1e_get_gfn(gw->l1e));
 }
 
 /* Given a walk_t, translate the gw->va into the guest's notion of the
@@ -397,8 +412,8 @@ static inline unsigned int guest_walk_to_page_order(const walk_t *gw)
  * we go.  For the purposes of reading pagetables we treat all non-RAM
  * memory as contining zeroes.
  *
- * Returns 0 for success, or the set of permission bits that we failed on
- * if the walk did not complete.
+ * Returns a boolean indicating success or failure.  walk_t.pfec contains
+ * the accumulated error code on failure.
  */
 
 /* Macro-fu so you can call guest_walk_tables() and get the right one. */
@@ -406,7 +421,7 @@ static inline unsigned int guest_walk_to_page_order(const walk_t *gw)
 #define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l)
 #define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
 
-extern uint32_t
+bool
 guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va,
                   walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map);
 
@@ -426,8 +441,21 @@ static inline void print_gw(const walk_t *gw)
 #endif /* All levels... */
     gprintk(XENLOG_INFO, "   l2e=%" PRI_gpte " l2mfn=%" PRI_mfn "\n",
             gw->l2e.l2, mfn_x(gw->l2mfn));
+#if GUEST_PAGING_LEVELS == 2
+    gprintk(XENLOG_INFO, "  el1e=%08" PRIx64 " l1mfn=%" PRI_mfn "\n",
+            gw->el1e, mfn_x(gw->l1mfn));
+#else
     gprintk(XENLOG_INFO, "   l1e=%" PRI_gpte " l1mfn=%" PRI_mfn "\n",
             gw->l1e.l1, mfn_x(gw->l1mfn));
+#endif
+    gprintk(XENLOG_INFO, "   pfec=%02x[%c%c%c%c%c%c]\n", gw->pfec,
+            gw->pfec & PFEC_prot_key     ? 'K' : '-',
+            gw->pfec & PFEC_insn_fetch   ? 'I' : 'd',
+            gw->pfec & PFEC_reserved_bit ? 'R' : '-',
+            gw->pfec & PFEC_user_mode    ? 'U' : 's',
+            gw->pfec & PFEC_write_access ? 'W' : 'r',
+            gw->pfec & PFEC_page_present ? 'P' : '-'
+        );
 }
 
 #endif /* _XEN_ASM_GUEST_PT_H */
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 470d29d..bc189d1 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -679,7 +679,7 @@ int p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
 extern void p2m_pt_init(struct p2m_domain *p2m);
 
 void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
-                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *rc);
+                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *pfec);
 
 /* Debugging and auditing of the P2M code? */
 #ifndef NDEBUG
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index 46faffc..bc5946b 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -314,9 +314,6 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
 #define _PAGE_PSE_PAT  _AC(0x1000,U)
 #define _PAGE_AVAIL_HIGH (_AC(0x7ff, U) << 12)
 #define _PAGE_NX       (cpu_has_nx ? _PAGE_NX_BIT : 0)
-/* non-architectural flags */
-#define _PAGE_PAGED   0x2000U
-#define _PAGE_SHARED  0x4000U
 
 /*
  * Debug option: Ensure that granted mappings are not implicitly unmapped.
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index d3ba8ea..75632d9 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -73,10 +73,12 @@
 #define PFEC_reserved_bit   (_AC(1,U) << 3)
 #define PFEC_insn_fetch     (_AC(1,U) << 4)
 #define PFEC_prot_key       (_AC(1,U) << 5)
+#define PFEC_arch_mask      (_AC(0xffff,U)) /* Architectural PFEC values. */
 /* Internally used only flags. */
 #define PFEC_page_paged     (1U<<16)
 #define PFEC_page_shared    (1U<<17)
+#define PFEC_implicit       (1U<<18) /* Pagewalk input for ldt/gdt/idt/tr accesses. */
+#define PFEC_synth_mask     (~PFEC_arch_mask) /* Synthetic PFEC values. */
 
 /* Other exception error code values. */
 #define X86_XEC_EXT         (_AC(1,U) << 0)
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 5a613bc..1a6cae6 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -152,12 +152,6 @@ typedef l4_pgentry_t root_pgentry_t;
 #define _PAGE_GNTTAB (1U<<22)
 
 /*
- * Bit 24 of a 24-bit flag mask!  This is not any bit of a real pte,
- * and is only used for signalling in variables that contain flags.
- */
-#define _PAGE_INVALID_BIT (1U<<24)
-
-/*
  * Bit 12 of a 24-bit flag mask. This corresponds to bit 52 of a pte.
  * This is needed to distinguish between user and kernel PTEs since _PAGE_USER
  * is asserted for both.
-- 
2.1.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 

