Xen project Mailing List

[Xen-changelog] [xen staging-4.9] x86/pv: Force a guest into shadow mode when it writes an L1TF-vulnerable PTE

Date: Tue, 14 Aug 2018 17:24:53 +0000

Delivery-date: Tue, 14 Aug 2018 17:24:56 +0000

List-id: "Change log for Mercurial (receive only)" <xen-changelog.lists.xenproject.org>

commit f73c7770426344a670e0d42e926ff8c0f897ee1b Author: Juergen Gross <jgross@xxxxxxxx> AuthorDate: Mon Jul 23 08:11:40 2018 +0200 Commit: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> CommitDate: Tue Aug 14 17:20:01 2018 +0100 x86/pv: Force a guest into shadow mode when it writes an L1TF-vulnerable PTE See the comment in shadow.h for an explanation of L1TF and the safety consideration of the PTEs. In the case that CONFIG_SHADOW_PAGING isn't compiled in, crash the domain instead. This allows well-behaved PV guests to function, while preventing L1TF from being exploited. (Note: PV guest kernels which haven't been updated with L1TF mitigations will likely be crashed as soon as they try paging a piece of userspace out to disk.) This is part of XSA-273 / CVE-2018-3620. Signed-off-by: Juergen Gross <jgross@xxxxxxxx> Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> Reviewed-by: Tim Deegan <tim@xxxxxxx> Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx> (cherry picked from commit 06e8b622d3f3c0fa5075e91b041c6f45549ad70a) --- xen/arch/x86/mm.c | 26 ++++++++++-- xen/include/asm-x86/shadow.h | 94 ++++++++++++++++++++++++++++++++++++++++++++ xen/include/xen/tasklet.h | 5 +++ 3 files changed, 122 insertions(+), 3 deletions(-) diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index d6d9546e20..0185c1c1cf 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -1251,7 +1251,7 @@ get_page_from_l2e( int rc; if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - return 1; + return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1; if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) { @@ -1293,7 +1293,7 @@ get_page_from_l3e( int rc; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) - return 1; + return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1; if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) { @@ -1320,7 +1320,7 @@ get_page_from_l4e( int rc; if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) - return 1; + return pv_l1tf_check_l4e(d, l4e) ? 
-ERESTART : 1; if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) { @@ -1567,6 +1567,13 @@ static int alloc_l1_table(struct page_info *page) for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { + if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) + { + ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0; + if ( ret ) + goto out; + } + if ( is_guest_l1_slot(i) ) switch ( ret = get_page_from_l1e(pl1e[i], d, d) ) { @@ -1588,6 +1595,7 @@ static int alloc_l1_table(struct page_info *page) fail: gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i); + out: while ( i-- > 0 ) if ( is_guest_l1_slot(i) ) put_page_from_l1e(pl1e[i], d); @@ -2189,6 +2197,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, rc = -EBUSY; } } + else if ( pv_l1tf_check_l1e(pt_dom, nl1e) ) + return -ERESTART; else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad)) ) { @@ -2252,6 +2262,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, rc = -EBUSY; } } + else if ( pv_l1tf_check_l2e(d, nl2e) ) + return -ERESTART; else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad)) ) { @@ -2320,6 +2332,8 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, rc = -EFAULT; } } + else if ( pv_l1tf_check_l3e(d, nl3e) ) + return -ERESTART; else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad)) ) { @@ -2385,6 +2399,8 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, rc = -EFAULT; } } + else if ( pv_l1tf_check_l4e(d, nl4e) ) + return -ERESTART; else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad)) ) { @@ -5660,6 +5676,10 @@ static int ptwr_emulated_update( /* Check the new PTE. 
*/ nl1e = l1e_from_intpte(val); + + if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) ) + return X86EMUL_RETRY; + switch ( ret = get_page_from_l1e(nl1e, d, d) ) { default: diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h index cbb226285b..46223b4f97 100644 --- a/xen/include/asm-x86/shadow.h +++ b/xen/include/asm-x86/shadow.h @@ -123,8 +123,102 @@ static inline int shadow_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, * What we can do is force a PV guest which writes a vulnerable PTE into * shadow mode, so Xen controls the pagetables which are reachable by the CPU * pagewalk. + * + * The core of the L1TF vulnerability is that the address bits of the PTE + * (accounting for PSE and factoring in the level-relevant part of the linear + * access) are sent for an L1D lookup (to retrieve the next-level PTE, or + * eventual memory address) before the Present or reserved bits (which would + * cause a terminal fault) are accounted for. If an L1D hit occurs, the + * resulting data is available for potentially dependent instructions. + * + * For Present PTEs, the PV type-count safety logic ensures that the address + * bits always point at a guest-accessible frame, which is safe WRT L1TF from + * Xen's point of view. In practice, a PV guest should be unable to set any + * reserved bits, so should be unable to create any present L1TF-vulnerable + * PTEs at all. + * + * Therefore, these safety checks apply to Not-Present PTEs only, where + * traditionally, Xen would have let the guest write any value it chose. + * + * The all-zero PTE potentially leaks mfn 0. All software on the system is + * expected to cooperate and not put any secrets there. In a Xen system, + * neither Xen nor dom0 are expected to touch mfn 0, as it typically contains + * the real mode IVT and Bios Data Area. Therefore, mfn 0 is considered safe. 
+ * + * Any PTE whose address is higher than the maximum cacheable address is safe, + * as it won't get an L1D hit. + * + * Speculative superpages also need accounting for, as PSE is considered + * irrespective of Present. We disallow PSE being set, as it allows an + * attacker to leak 2M or 1G of data starting from mfn 0. Also, because of + * recursive/linear pagetables, we must consider PSE even at L4, as hardware + * will interpret an L4e as an L3e during a recursive walk. */ +static inline bool is_l1tf_safe_maddr(intpte_t pte) +{ + paddr_t maddr = pte & l1tf_addr_mask; + + return maddr == 0 || maddr >= l1tf_safe_maddr; +} + +static inline bool pv_l1tf_check_pte(struct domain *d, unsigned int level, + intpte_t pte) +{ + ASSERT(is_pv_domain(d)); + ASSERT(!(pte & _PAGE_PRESENT)); + + if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) && + (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) ) + { +#ifdef CONFIG_SHADOW_PAGING + struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet; + + printk(XENLOG_G_WARNING + "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n", + d->domain_id, level, pte); + /* + * Safety consideration for accessing tasklet.scheduled_on without the + * tasklet lock. This is a singleshot tasklet with the side effect of + * setting PG_SH_forced (checked just above). Multiple vcpus can race + * to schedule the tasklet, but if we observe it scheduled anywhere, + * that is good enough. 
+ */ + smp_rmb(); + if ( !tasklet_is_scheduled(t) ) + tasklet_schedule(t); +#else + printk(XENLOG_G_ERR + "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n", + d->domain_id, level, pte); + domain_crash(d); +#endif + return true; + } + + return false; +} + +static inline bool pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e) +{ + return pv_l1tf_check_pte(d, 1, l1e.l1); +} + +static inline bool pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e) +{ + return pv_l1tf_check_pte(d, 2, l2e.l2); +} + +static inline bool pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e) +{ + return pv_l1tf_check_pte(d, 3, l3e.l3); +} + +static inline bool pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e) +{ + return pv_l1tf_check_pte(d, 4, l4e.l4); +} + void pv_l1tf_tasklet(unsigned long data); static inline void pv_l1tf_domain_init(struct domain *d) diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h index 8c3de7e20e..88c91507e5 100644 --- a/xen/include/xen/tasklet.h +++ b/xen/include/xen/tasklet.h @@ -40,6 +40,11 @@ DECLARE_PER_CPU(unsigned long, tasklet_work_to_do); #define TASKLET_enqueued (1ul << _TASKLET_enqueued) #define TASKLET_scheduled (1ul << _TASKLET_scheduled) +static inline bool tasklet_is_scheduled(const struct tasklet *t) +{ + return t->scheduled_on != -1; +} + void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu); void tasklet_schedule(struct tasklet *t); void do_tasklet(void); -- generated by git-patchbot for /home/xen/git/xen.git#staging-4.9 _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/xen-changelog

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.