[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen staging-4.10] x86/pv: Force a guest into shadow mode when it writes an L1TF-vulnerable PTE



commit c67a8b808ad16864248f24ef099e7d181e107dcb
Author:     Juergen Gross <jgross@xxxxxxxx>
AuthorDate: Mon Jul 23 08:11:40 2018 +0200
Commit:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CommitDate: Tue Aug 14 17:16:28 2018 +0100

    x86/pv: Force a guest into shadow mode when it writes an L1TF-vulnerable PTE
    
    See the comment in shadow.h for an explanation of L1TF and the safety
    consideration of the PTEs.
    
    In the case that CONFIG_SHADOW_PAGING isn't compiled in, crash the domain
    instead.  This allows well-behaved PV guests to function, while preventing
    L1TF from being exploited.  (Note: PV guest kernels which haven't been updated
    with L1TF mitigations will likely be crashed as soon as they try paging a
    piece of userspace out to disk.)
    
    This is part of XSA-273 / CVE-2018-3620.
    
    Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
    Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
    Reviewed-by: Tim Deegan <tim@xxxxxxx>
    Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx>
    (cherry picked from commit 06e8b622d3f3c0fa5075e91b041c6f45549ad70a)
---
 xen/arch/x86/mm.c               | 22 ++++++++--
 xen/arch/x86/pv/ro-page-fault.c |  5 +++
 xen/include/asm-x86/shadow.h    | 94 +++++++++++++++++++++++++++++++++++++++++
 xen/include/xen/tasklet.h       |  5 +++
 4 files changed, 123 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 821bd41ac2..f3dfe35785 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1158,7 +1158,7 @@ get_page_from_l2e(
     int rc;
 
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 1;
+        return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
 
     if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
     {
@@ -1193,7 +1193,7 @@ get_page_from_l3e(
     int rc;
 
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 1;
+        return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
 
     if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
     {
@@ -1226,7 +1226,7 @@ get_page_from_l4e(
     int rc;
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return 1;
+        return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
 
     if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
     {
@@ -1421,6 +1421,13 @@ static int alloc_l1_table(struct page_info *page)
 
     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
     {
+        if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
+        {
+            ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
+            if ( ret )
+                goto out;
+        }
+
         switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
         {
         default:
@@ -1441,6 +1448,7 @@ static int alloc_l1_table(struct page_info *page)
 
  fail:
     gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
+ out:
     while ( i-- > 0 )
         put_page_from_l1e(pl1e[i], d);
 
@@ -2037,6 +2045,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
             rc = -EBUSY;
         }
     }
+    else if ( pv_l1tf_check_l1e(pt_dom, nl1e) )
+        return -ERESTART;
     else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
                                      preserve_ad)) )
     {
@@ -2100,6 +2110,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
             rc = -EBUSY;
         }
     }
+    else if ( pv_l1tf_check_l2e(d, nl2e) )
+        return -ERESTART;
     else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
                                      preserve_ad)) )
     {
@@ -2161,6 +2173,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
             rc = -EFAULT;
         }
     }
+    else if ( pv_l1tf_check_l3e(d, nl3e) )
+        return -ERESTART;
     else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
                                      preserve_ad)) )
     {
@@ -2226,6 +2240,8 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
             rc = -EFAULT;
         }
     }
+    else if ( pv_l1tf_check_l4e(d, nl4e) )
+        return -ERESTART;
     else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
                                      preserve_ad)) )
     {
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index 6b2976d3df..622bb7dff0 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -29,6 +29,7 @@
 #include <asm/mm.h>
 #include <asm/pci.h>
 #include <asm/pv/mm.h>
+#include <asm/shadow.h>
 
 #include "emulate.h"
 #include "mm.h"
@@ -127,6 +128,10 @@ static int ptwr_emulated_update(unsigned long addr, paddr_t old, paddr_t val,
 
     /* Check the new PTE. */
     nl1e = l1e_from_intpte(val);
+
+    if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) )
+        return X86EMUL_RETRY;
+
     switch ( ret = get_page_from_l1e(nl1e, d, d) )
     {
     default:
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
index 14afb7db52..f40f411871 100644
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -124,8 +124,102 @@ static inline int shadow_domctl(struct domain *d,
  * What we can do is force a PV guest which writes a vulnerable PTE into
  * shadow mode, so Xen controls the pagetables which are reachable by the CPU
  * pagewalk.
+ *
+ * The core of the L1TF vulnerability is that the address bits of the PTE
+ * (accounting for PSE and factoring in the level-relevant part of the linear
+ * access) are sent for an L1D lookup (to retrieve the next-level PTE, or
+ * eventual memory address) before the Present or reserved bits (which would
+ * cause a terminal fault) are accounted for.  If an L1D hit occurs, the
+ * resulting data is available for potentially dependent instructions.
+ *
+ * For Present PTEs, the PV type-count safety logic ensures that the address
+ * bits always point at a guest-accessible frame, which is safe WRT L1TF from
+ * Xen's point of view.  In practice, a PV guest should be unable to set any
+ * reserved bits, so should be unable to create any present L1TF-vulnerable
+ * PTEs at all.
+ *
+ * Therefore, these safety checks apply to Not-Present PTEs only, where
+ * traditionally, Xen would have let the guest write any value it chose.
+ *
+ * The all-zero PTE potentially leaks mfn 0.  All software on the system is
+ * expected to cooperate and not put any secrets there.  In a Xen system,
+ * neither Xen nor dom0 are expected to touch mfn 0, as it typically contains
+ * the real mode IVT and Bios Data Area.  Therefore, mfn 0 is considered safe.
+ *
+ * Any PTE whose address is higher than the maximum cacheable address is safe,
+ * as it won't get an L1D hit.
+ *
+ * Speculative superpages also need accounting for, as PSE is considered
+ * irrespective of Present.  We disallow PSE being set, as it allows an
+ * attacker to leak 2M or 1G of data starting from mfn 0.  Also, because of
+ * recursive/linear pagetables, we must consider PSE even at L4, as hardware
+ * will interpret an L4e as an L3e during a recursive walk.
  */
 
+static inline bool is_l1tf_safe_maddr(intpte_t pte)
+{
+    paddr_t maddr = pte & l1tf_addr_mask;
+
+    return maddr == 0 || maddr >= l1tf_safe_maddr;
+}
+
+static inline bool pv_l1tf_check_pte(struct domain *d, unsigned int level,
+                                     intpte_t pte)
+{
+    ASSERT(is_pv_domain(d));
+    ASSERT(!(pte & _PAGE_PRESENT));
+
+    if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) &&
+         (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) )
+    {
+#ifdef CONFIG_SHADOW_PAGING
+        struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet;
+
+        printk(XENLOG_G_WARNING
+               "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n",
+               d->domain_id, level, pte);
+        /*
+         * Safety consideration for accessing tasklet.scheduled_on without the
+         * tasklet lock.  This is a singleshot tasklet with the side effect of
+         * setting PG_SH_forced (checked just above).  Multiple vcpus can race
+         * to schedule the tasklet, but if we observe it scheduled anywhere,
+         * that is good enough.
+         */
+        smp_rmb();
+        if ( !tasklet_is_scheduled(t) )
+            tasklet_schedule(t);
+#else
+        printk(XENLOG_G_ERR
+               "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n",
+               d->domain_id, level, pte);
+        domain_crash(d);
+#endif
+        return true;
+    }
+
+    return false;
+}
+
+static inline bool pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e)
+{
+    return pv_l1tf_check_pte(d, 1, l1e.l1);
+}
+
+static inline bool pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e)
+{
+    return pv_l1tf_check_pte(d, 2, l2e.l2);
+}
+
+static inline bool pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e)
+{
+    return pv_l1tf_check_pte(d, 3, l3e.l3);
+}
+
+static inline bool pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e)
+{
+    return pv_l1tf_check_pte(d, 4, l4e.l4);
+}
+
 void pv_l1tf_tasklet(unsigned long data);
 
 static inline void pv_l1tf_domain_init(struct domain *d)
diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h
index 23d69c738e..bc9ddace6d 100644
--- a/xen/include/xen/tasklet.h
+++ b/xen/include/xen/tasklet.h
@@ -50,6 +50,11 @@ static inline bool tasklet_work_to_do(unsigned int cpu)
                                                 TASKLET_scheduled);
 }
 
+static inline bool tasklet_is_scheduled(const struct tasklet *t)
+{
+    return t->scheduled_on != -1;
+}
+
 void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu);
 void tasklet_schedule(struct tasklet *t);
 void do_tasklet(void);
--
generated by git-patchbot for /home/xen/git/xen.git#staging-4.10

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.