Re: [Xen-devel] [PATCH v6][XSA-97] x86/paging: make log-dirty operations preemptible
On 02/10/14 16:15, Jan Beulich wrote:
> Both the freeing and the inspection of the bitmap get done in (nested)
> loops which - besides having a rather high iteration count in general,
> albeit that would be covered by XSA-77 - have the number of non-trivial
> iterations they need to perform (indirectly) controllable by both the
> guest they are for and any domain controlling the guest (including the
> one running qemu for it).
>
> Note that the tying of the continuations to the invoking domain (which
> previously [wrongly] used the invoking vCPU instead) implies that the
> tools requesting such operations have to make sure they don't issue
> multiple similar operations in parallel.
>
> Note further that this breaks supervisor-mode kernel assumptions in
> hypercall_create_continuation() (where regs->eip gets rewound to the
> current hypercall stub beginning), but otoh
> hypercall_cancel_continuation() doesn't work in that mode either.
> Perhaps time to rip out all the remains of that feature?
>
> This is part of CVE-2014-5146 / XSA-97.
>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> Reviewed-by: Tim Deegan <tim@xxxxxxx>

This patch has had moderate testing in XenRT and has not yet fallen over
(previous iterations reliably fell over in a number of ways, given our
standard VM lifecycle operation tests).  All testing was with
migration-v2 on a Xen-4.4 base.

Tested-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
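
For readers new to this area: the mechanism used throughout the patch
below is Xen's usual preemptible-loop shape, i.e. poll
hypercall_preempt_check() from inside the loop, record how far the walk
got, and hand -ERESTART back so the operation gets re-issued as a
continuation. A minimal sketch of that shape (NR_ENTRIES,
process_entry() and struct op_state are made-up stand-ins; the real
patch keeps its cursor in d->arch.paging.preempt.log_dirty):

    /* Illustration only, not part of the patch. */
    struct op_state {
        unsigned int i;                   /* next entry to process */
    };

    static int preemptible_op(struct domain *d, struct op_state *state)
    {
        int rc = 0;

        for ( ; state->i < NR_ENTRIES; state->i++ )
        {
            process_entry(d, state->i);   /* a bounded chunk of work */

            /* No point checking on the final iteration. */
            if ( state->i < NR_ENTRIES - 1 && hypercall_preempt_check() )
            {
                state->i++;               /* resume after this entry */
                rc = -ERESTART;           /* caller makes a continuation */
                break;
            }
        }

        return rc;
    }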
> ---
> v6: Move stats (dirty_count and fault_count) zeroing to the final
>     success exit path (also fixing pre-existing issues with them
>     getting cleaned but an error getting detected/reported later).
> v5: Also wire up helper hypercall in HVM/PVH hypercall tables (pointed
>     out by Tim).
> v4: Fold in Andrew's paging_mode_log_dirty() guarding adjustments to
>     paging_log_dirty_disable() and its caller. Convert tying
>     continuations to domain (was vCPU). Introduce a separate internal
>     hypercall (__HYPERVISOR_arch_1) to deal with the continuation (both
>     suggested by Tim and Andrew). As a result hap_domctl() and
>     shadow_domctl() now don't propagate -ERESTART into paging_domctl()
>     anymore, as that would imply setting d->arch.paging.preempt fields
>     without holding the paging lock. Sadly this results in mixed
>     methods used for continuations here.
> v3: Convert if(!resuming) to ASSERT() in paging_log_dirty_op().
> v2: Re-order L4 loop continuation/termination handling in
>     paging_free_log_dirty_bitmap(). Add an ASSERT() in a special case
>     exit path of paging_log_dirty_op().
>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1941,7 +1941,9 @@ int domain_relinquish_resources(struct d
>          pci_release_devices(d);
>
>          /* Tear down paging-assistance stuff. */
> -        paging_teardown(d);
> +        ret = paging_teardown(d);
> +        if ( ret )
> +            return ret;
>
>          /* Drop the in-use references to page-table bases. */
>          for_each_vcpu ( d, v )
> --- a/xen/arch/x86/domctl.c
> +++ b/xen/arch/x86/domctl.c
> @@ -61,9 +61,11 @@ long arch_do_domctl(
>
>      case XEN_DOMCTL_shadow_op:
>      {
> -        ret = paging_domctl(d,
> -                            &domctl->u.shadow_op,
> -                            guest_handle_cast(u_domctl, void));
> +        ret = paging_domctl(d, &domctl->u.shadow_op,
> +                            guest_handle_cast(u_domctl, void), 0);
> +        if ( ret == -ERESTART )
> +            return hypercall_create_continuation(__HYPERVISOR_arch_1,
> +                                                 "h", u_domctl);
>          copyback = 1;
>      }
>      break;
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -4770,7 +4770,8 @@ static hvm_hypercall_t *const hvm_hyperc
>      HYPERCALL(hvm_op),
>      HYPERCALL(sysctl),
>      HYPERCALL(domctl),
> -    HYPERCALL(tmem_op)
> +    HYPERCALL(tmem_op),
> +    [ __HYPERVISOR_arch_1 ] = (hvm_hypercall_t *)paging_domctl_continuation
>  };
>
>  #define COMPAT_CALL(x) \
> @@ -4790,7 +4791,8 @@ static hvm_hypercall_t *const hvm_hyperc
>      HYPERCALL(hvm_op),
>      HYPERCALL(sysctl),
>      HYPERCALL(domctl),
> -    HYPERCALL(tmem_op)
> +    HYPERCALL(tmem_op),
> +    [ __HYPERVISOR_arch_1 ] = (hvm_hypercall_t *)paging_domctl_continuation
>  };
>
>  /* PVH 32bitfixme. */
> @@ -4808,7 +4810,8 @@ static hvm_hypercall_t *const pvh_hyperc
>      [ __HYPERVISOR_physdev_op ]  = (hvm_hypercall_t *)hvm_physdev_op,
>      HYPERCALL(hvm_op),
>      HYPERCALL(sysctl),
> -    HYPERCALL(domctl)
> +    HYPERCALL(domctl),
> +    [ __HYPERVISOR_arch_1 ] = (hvm_hypercall_t *)paging_domctl_continuation
>  };
>
>  int hvm_do_hypercall(struct cpu_user_regs *regs)
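
One practical consequence of keying the continuation to the invoking
domain: a shadow op issued while a different domain's op against the
same guest is still in flight now fails with -EBUSY rather than
trampling the saved cursor. Toolstacks therefore have to serialise
these themselves. Hypothetically, the caller side could look like the
following (issue_shadow_op() is a made-up wrapper around the
XEN_DOMCTL_shadow_op domctl, not a real libxc call):

    /* Hypothetical caller-side retry, illustration only. */
    static int clean_dirty_bitmap_serialised(xc_interface *xch,
                                             uint32_t domid, void *bitmap,
                                             unsigned long pages)
    {
        int rc;

        do {
            rc = issue_shadow_op(xch, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                                 bitmap, pages);
            if ( rc == -EBUSY )
                usleep(1000);   /* a prior op is unfinished; back off */
        } while ( rc == -EBUSY );

        return rc;
    }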
> --- a/xen/arch/x86/mm/paging.c
> +++ b/xen/arch/x86/mm/paging.c
> @@ -26,6 +26,7 @@
>  #include <asm/shadow.h>
>  #include <asm/p2m.h>
>  #include <asm/hap.h>
> +#include <asm/event.h>
>  #include <asm/hvm/nestedhvm.h>
>  #include <xen/numa.h>
>  #include <xsm/xsm.h>
> @@ -116,26 +117,46 @@ static void paging_free_log_dirty_page(s
>      d->arch.paging.free_page(d, mfn_to_page(mfn));
>  }
>
> -void paging_free_log_dirty_bitmap(struct domain *d)
> +static int paging_free_log_dirty_bitmap(struct domain *d, int rc)
>  {
>      mfn_t *l4, *l3, *l2;
>      int i4, i3, i2;
>
> +    paging_lock(d);
> +
>      if ( !mfn_valid(d->arch.paging.log_dirty.top) )
> -        return;
> +    {
> +        paging_unlock(d);
> +        return 0;
> +    }
>
> -    paging_lock(d);
> +    if ( !d->arch.paging.preempt.dom )
> +    {
> +        memset(&d->arch.paging.preempt.log_dirty, 0,
> +               sizeof(d->arch.paging.preempt.log_dirty));
> +        ASSERT(rc <= 0);
> +        d->arch.paging.preempt.log_dirty.done = -rc;
> +    }
> +    else if ( d->arch.paging.preempt.dom != current->domain ||
> +              d->arch.paging.preempt.op != XEN_DOMCTL_SHADOW_OP_OFF )
> +    {
> +        paging_unlock(d);
> +        return -EBUSY;
> +    }
>
>      l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
> +    i4 = d->arch.paging.preempt.log_dirty.i4;
> +    i3 = d->arch.paging.preempt.log_dirty.i3;
> +    rc = 0;
>
> -    for ( i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++ )
> +    for ( ; i4 < LOGDIRTY_NODE_ENTRIES; i4++, i3 = 0 )
>      {
>          if ( !mfn_valid(l4[i4]) )
>              continue;
>
>          l3 = map_domain_page(mfn_x(l4[i4]));
>
> -        for ( i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
> +        for ( ; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
>          {
>              if ( !mfn_valid(l3[i3]) )
>                  continue;
> @@ -148,20 +169,54 @@ void paging_free_log_dirty_bitmap(struct
>
>              unmap_domain_page(l2);
>              paging_free_log_dirty_page(d, l3[i3]);
> +            l3[i3] = _mfn(INVALID_MFN);
> +
> +            if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
> +            {
> +                d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
> +                d->arch.paging.preempt.log_dirty.i4 = i4;
> +                rc = -ERESTART;
> +                break;
> +            }
>          }
>
>          unmap_domain_page(l3);
> +        if ( rc )
> +            break;
>          paging_free_log_dirty_page(d, l4[i4]);
> +        l4[i4] = _mfn(INVALID_MFN);
> +
> +        if ( i4 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
> +        {
> +            d->arch.paging.preempt.log_dirty.i3 = 0;
> +            d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
> +            rc = -ERESTART;
> +            break;
> +        }
>      }
>
>      unmap_domain_page(l4);
> -    paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
> -    d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
>
> -    ASSERT(d->arch.paging.log_dirty.allocs == 0);
> -    d->arch.paging.log_dirty.failed_allocs = 0;
> +    if ( !rc )
> +    {
> +        paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
> +        d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
> +
> +        ASSERT(d->arch.paging.log_dirty.allocs == 0);
> +        d->arch.paging.log_dirty.failed_allocs = 0;
> +
> +        rc = -d->arch.paging.preempt.log_dirty.done;
> +        d->arch.paging.preempt.dom = NULL;
> +    }
> +    else
> +    {
> +        d->arch.paging.preempt.dom = current->domain;
> +        d->arch.paging.preempt.op = XEN_DOMCTL_SHADOW_OP_OFF;
> +    }
>
>      paging_unlock(d);
> +
> +    return rc;
>  }
>
>  int paging_log_dirty_enable(struct domain *d, bool_t log_global)
> @@ -187,15 +242,25 @@ int paging_log_dirty_enable(struct domai
>      return ret;
>  }
>
> -int paging_log_dirty_disable(struct domain *d)
> +static int paging_log_dirty_disable(struct domain *d, bool_t resuming)
>  {
> -    int ret;
> +    int ret = 1;
> +
> +    if ( !resuming )
> +    {
> +        domain_pause(d);
> +        /* Safe because the domain is paused. */
> +        if ( paging_mode_log_dirty(d) )
> +        {
> +            ret = d->arch.paging.log_dirty.disable_log_dirty(d);
> +            ASSERT(ret <= 0);
> +        }
> +    }
> +
> +    ret = paging_free_log_dirty_bitmap(d, ret);
> +    if ( ret == -ERESTART )
> +        return ret;
>
> -    domain_pause(d);
> -    /* Safe because the domain is paused. */
> -    ret = d->arch.paging.log_dirty.disable_log_dirty(d);
> -    if ( !paging_mode_log_dirty(d) )
> -        paging_free_log_dirty_bitmap(d);
>      domain_unpause(d);
>
>      return ret;
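
The rc plumbing between these two functions is subtle enough to spell
out: paging_log_dirty_disable() seeds paging_free_log_dirty_bitmap()
with the disable_log_dirty() result on the first pass, or with the
sentinel 1 when resuming (in which case preempt.dom is already set and
the value never reaches the ASSERT). The bitmap code parks that value,
negated, in preempt.log_dirty.done across any number of -ERESTART round
trips, and the final successful pass hands it back as -done. In
compressed form (field names as in the patch, control flow simplified):

    /* Illustrative condensation, not verbatim code from the patch. */
    ret = resuming ? 1                      /* sentinel, ignored below */
                   : disable_log_dirty(d);  /* always <= 0             */

    /* First pass:  preempt.log_dirty.done = -ret;   (so >= 0)
     * Preempted:   done survives untouched across -ERESTART.
     * Final pass:  return -preempt.log_dirty.done;  (the original ret) */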
> @@ -335,7 +400,9 @@ int paging_mfn_is_dirty(struct domain *d
>
>  /* Read a domain's log-dirty bitmap and stats.  If the operation is a CLEAN,
>   * clear the bitmap and stats as well. */
> -int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
> +static int paging_log_dirty_op(struct domain *d,
> +                               struct xen_domctl_shadow_op *sc,
> +                               bool_t resuming)
>  {
>      int rv = 0, clean = 0, peek = 1;
>      unsigned long pages = 0;
> @@ -343,9 +410,22 @@ int paging_log_dirty_op(struct domain *d
>      unsigned long *l1 = NULL;
>      int i4, i3, i2;
>
> -    domain_pause(d);
> +    if ( !resuming )
> +        domain_pause(d);
>      paging_lock(d);
>
> +    if ( !d->arch.paging.preempt.dom )
> +        memset(&d->arch.paging.preempt.log_dirty, 0,
> +               sizeof(d->arch.paging.preempt.log_dirty));
> +    else if ( d->arch.paging.preempt.dom != current->domain ||
> +              d->arch.paging.preempt.op != sc->op )
> +    {
> +        paging_unlock(d);
> +        ASSERT(!resuming);
> +        domain_unpause(d);
> +        return -EBUSY;
> +    }
> +
>      clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
>
>      PAGING_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
> @@ -357,12 +437,6 @@ int paging_log_dirty_op(struct domain *d
>      sc->stats.fault_count = d->arch.paging.log_dirty.fault_count;
>      sc->stats.dirty_count = d->arch.paging.log_dirty.dirty_count;
>
> -    if ( clean )
> -    {
> -        d->arch.paging.log_dirty.fault_count = 0;
> -        d->arch.paging.log_dirty.dirty_count = 0;
> -    }
> -
>      if ( guest_handle_is_null(sc->dirty_bitmap) )
>          /* caller may have wanted just to clean the state or access stats. */
>          peek = 0;
> @@ -374,17 +448,15 @@ int paging_log_dirty_op(struct domain *d
>          goto out;
>      }
>
> -    pages = 0;
>      l4 = paging_map_log_dirty_bitmap(d);
> +    i4 = d->arch.paging.preempt.log_dirty.i4;
> +    i3 = d->arch.paging.preempt.log_dirty.i3;
> +    pages = d->arch.paging.preempt.log_dirty.done;
>
> -    for ( i4 = 0;
> -          (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES);
> -          i4++ )
> +    for ( ; (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES); i4++, i3 = 0 )
>      {
>          l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL;
> -        for ( i3 = 0;
> -              (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES);
> -              i3++ )
> +        for ( ; (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES); i3++ )
>          {
>              l2 = ((l3 && mfn_valid(l3[i3])) ?
>                    map_domain_page(mfn_x(l3[i3])) : NULL);
> @@ -419,18 +491,58 @@ int paging_log_dirty_op(struct domain *d
>              }
>              if ( l2 )
>                  unmap_domain_page(l2);
> +
> +            if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
> +            {
> +                d->arch.paging.preempt.log_dirty.i4 = i4;
> +                d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
> +                rv = -ERESTART;
> +                break;
> +            }
>          }
>          if ( l3 )
>              unmap_domain_page(l3);
> +
> +        if ( !rv && i4 < LOGDIRTY_NODE_ENTRIES - 1 &&
> +             hypercall_preempt_check() )
> +        {
> +            d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
> +            d->arch.paging.preempt.log_dirty.i3 = 0;
> +            rv = -ERESTART;
> +        }
> +        if ( rv )
> +            break;
>      }
>      if ( l4 )
>          unmap_domain_page(l4);
>
> -    if ( pages < sc->pages )
> -        sc->pages = pages;
> +    if ( !rv )
> +    {
> +        d->arch.paging.preempt.dom = NULL;
> +        if ( clean )
> +        {
> +            d->arch.paging.log_dirty.fault_count = 0;
> +            d->arch.paging.log_dirty.dirty_count = 0;
> +        }
> +    }
> +    else
> +    {
> +        d->arch.paging.preempt.dom = current->domain;
> +        d->arch.paging.preempt.op = sc->op;
> +        d->arch.paging.preempt.log_dirty.done = pages;
> +    }
>
>      paging_unlock(d);
>
> +    if ( rv )
> +    {
> +        /* Never leave the domain paused on real errors. */
> +        ASSERT(rv == -ERESTART);
> +        return rv;
> +    }
> +
> +    if ( pages < sc->pages )
> +        sc->pages = pages;
>      if ( clean )
>      {
>          /* We need to further call clean_dirty_bitmap() functions of specific
> @@ -441,6 +553,7 @@ int paging_log_dirty_op(struct domain *d
>      return rv;
>
>   out:
> +    d->arch.paging.preempt.dom = NULL;
>      paging_unlock(d);
>      domain_unpause(d);
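
To put numbers on why this loop needs interior preemption points at
all, assuming the standard x86 constants (PAGE_SIZE = 4096,
PAGETABLE_ORDER = 9, hence LOGDIRTY_NODE_ENTRIES = 512):

    /*
     * bits per l1 leaf page  = PAGE_SIZE * 8   = 32768 pfns
     * pfns per l2 node       = 512 * 32768     = 2^24
     *                        = 64 GiB of guest address space
     *
     * So a single iteration of the i3 loop above can walk up to 512
     * leaf pages, which is why the preemption checks sit at the i3
     * and i4 levels rather than only around the whole operation.
     */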
> @@ -504,12 +617,6 @@ void paging_log_dirty_init(struct domain
>      d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
>  }
>
> -/* This function fress log dirty bitmap resources. */
> -static void paging_log_dirty_teardown(struct domain*d)
> -{
> -    paging_free_log_dirty_bitmap(d);
> -}
> -
>  /************************************************/
>  /*           CODE FOR PAGING SUPPORT            */
>  /************************************************/
> @@ -551,7 +658,7 @@ void paging_vcpu_init(struct vcpu *v)
>
>
>  int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
> -                  XEN_GUEST_HANDLE_PARAM(void) u_domctl)
> +                  XEN_GUEST_HANDLE_PARAM(void) u_domctl, bool_t resuming)
>  {
>      int rc;
>
> @@ -575,6 +682,20 @@ int paging_domctl(struct domain *d, xen_
>          return -EINVAL;
>      }
>
> +    if ( resuming
> +         ? (d->arch.paging.preempt.dom != current->domain ||
> +            d->arch.paging.preempt.op != sc->op)
> +         : (d->arch.paging.preempt.dom &&
> +            sc->op != XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION) )
> +    {
> +        printk(XENLOG_G_DEBUG
> +               "%pv: Paging op %#x on Dom%u with unfinished prior op %#x by Dom%u\n",
> +               current, sc->op, d->domain_id, d->arch.paging.preempt.op,
> +               d->arch.paging.preempt.dom
> +               ? d->arch.paging.preempt.dom->domain_id : DOMID_INVALID);
> +        return -EBUSY;
> +    }
> +
>      rc = xsm_shadow_control(XSM_HOOK, d, sc->op);
>      if ( rc )
>          return rc;
> @@ -597,14 +718,13 @@ int paging_domctl(struct domain *d, xen_
>          return paging_log_dirty_enable(d, 1);
>
>      case XEN_DOMCTL_SHADOW_OP_OFF:
> -        if ( paging_mode_log_dirty(d) )
> -            if ( (rc = paging_log_dirty_disable(d)) != 0 )
> -                return rc;
> +        if ( (rc = paging_log_dirty_disable(d, resuming)) != 0 )
> +            return rc;
>          break;
>
>      case XEN_DOMCTL_SHADOW_OP_CLEAN:
>      case XEN_DOMCTL_SHADOW_OP_PEEK:
> -        return paging_log_dirty_op(d, sc);
> +        return paging_log_dirty_op(d, sc, resuming);
>      }
>
>      /* Here, dispatch domctl to the appropriate paging code */
> @@ -614,19 +734,67 @@ int paging_domctl(struct domain *d, xen_
>          return shadow_domctl(d, sc, u_domctl);
>  }
>
> +long paging_domctl_continuation(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
> +{
> +    struct xen_domctl op;
> +    struct domain *d;
> +    int ret;
> +
> +    if ( copy_from_guest(&op, u_domctl, 1) )
> +        return -EFAULT;
> +
> +    if ( op.interface_version != XEN_DOMCTL_INTERFACE_VERSION ||
> +         op.cmd != XEN_DOMCTL_shadow_op )
> +        return -EBADRQC;
> +
> +    d = rcu_lock_domain_by_id(op.domain);
> +    if ( d == NULL )
> +        return -ESRCH;
> +
> +    ret = xsm_domctl(XSM_OTHER, d, op.cmd);
> +    if ( !ret )
> +    {
> +        if ( domctl_lock_acquire() )
> +        {
> +            ret = paging_domctl(d, &op.u.shadow_op,
> +                                guest_handle_cast(u_domctl, void), 1);
> +
> +            domctl_lock_release();
> +        }
> +        else
> +            ret = -ERESTART;
> +    }
> +
> +    rcu_unlock_domain(d);
> +
> +    if ( ret == -ERESTART )
> +        ret = hypercall_create_continuation(__HYPERVISOR_arch_1,
> +                                            "h", u_domctl);
> +    else if ( __copy_field_to_guest(u_domctl, &op, u.shadow_op) )
> +        ret = -EFAULT;
> +
> +    return ret;
> +}
> +
>  /* Call when destroying a domain */
> -void paging_teardown(struct domain *d)
> +int paging_teardown(struct domain *d)
>  {
> +    int rc;
> +
>      if ( hap_enabled(d) )
>          hap_teardown(d);
>      else
>          shadow_teardown(d);
>
>      /* clean up log dirty resources. */
> -    paging_log_dirty_teardown(d);
> +    rc = paging_free_log_dirty_bitmap(d, 0);
> +    if ( rc == -ERESTART )
> +        return rc;
>
>      /* Move populate-on-demand cache back to domain_list for destruction */
>      p2m_pod_empty_cache(d);
> +
> +    return rc;
>  }
>
>  /* Call once all of the references to the domain have gone away */
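
Putting the pieces together, the round trip for a preempted op looks
roughly like this (sketch of the control flow, not code):

    /*
     * do_domctl(XEN_DOMCTL_shadow_op)              initial invocation
     *   -> paging_domctl(d, sc, u_domctl, 0)
     *        saves the i4/i3 cursor, returns -ERESTART
     *   -> hypercall_create_continuation(__HYPERVISOR_arch_1,
     *                                    "h", u_domctl)
     *
     * ... the invoking vcpu re-enters the hypervisor ...
     *
     * paging_domctl_continuation(u_domctl)         helper hypercall
     *   -> re-validates the domctl, takes the domctl lock
     *   -> paging_domctl(d, &op.u.shadow_op, ..., 1)
     *        resumes from the saved cursor
     *   -> either yet another continuation, or a final return that
     *      copies u.shadow_op back to the caller
     */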
> --- a/xen/arch/x86/x86_64/compat/entry.S
> +++ b/xen/arch/x86/x86_64/compat/entry.S
> @@ -421,6 +421,7 @@ ENTRY(compat_hypercall_table)
>          .quad compat_ni_hypercall
>          .endr
>          .quad do_mca                    /* 48 */
> +        .quad paging_domctl_continuation
>          .rept NR_hypercalls-((.-compat_hypercall_table)/8)
>          .quad compat_ni_hypercall
>          .endr
> @@ -469,6 +470,7 @@ ENTRY(compat_hypercall_args_table)
>          .byte 0 /* compat_ni_hypercall      */
>          .endr
>          .byte 1 /* do_mca                   */
> +        .byte 1 /* paging_domctl_continuation */
>          .rept NR_hypercalls-(.-compat_hypercall_args_table)
>          .byte 0 /* compat_ni_hypercall      */
>          .endr
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -776,6 +776,7 @@ ENTRY(hypercall_table)
>          .quad do_ni_hypercall
>          .endr
>          .quad do_mca                /* 48 */
> +        .quad paging_domctl_continuation
>          .rept NR_hypercalls-((.-hypercall_table)/8)
>          .quad do_ni_hypercall
>          .endr
> @@ -824,6 +825,7 @@ ENTRY(hypercall_args_table)
>          .byte 0 /* do_ni_hypercall  */
>          .endr
>          .byte 1 /* do_mca */            /* 48 */
> +        .byte 1 /* paging_domctl_continuation */
>          .rept NR_hypercalls-(.-hypercall_args_table)
>          .byte 0 /* do_ni_hypercall  */
>          .endr
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -619,7 +619,6 @@ int domain_kill(struct domain *d)
>          {
>              if ( rc == -ERESTART )
>                  rc = -EAGAIN;
> -            BUG_ON(rc != -EAGAIN);
>              break;
>          }
>          if ( sched_move_domain(d, cpupool0) )
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -187,6 +187,20 @@ struct paging_domain {
>      struct hap_domain    hap;
>      /* log dirty support */
>      struct log_dirty_domain log_dirty;
> +
> +    /* preemption handling */
> +    struct {
> +        const struct domain *dom;
> +        unsigned int op;
> +        union {
> +            struct {
> +                unsigned long done:PADDR_BITS - PAGE_SHIFT;
> +                unsigned long i4:PAGETABLE_ORDER;
> +                unsigned long i3:PAGETABLE_ORDER;
> +            } log_dirty;
> +        };
> +    } preempt;
> +
>      /* alloc/free pages from the pool for paging-assistance structures
>       * (used by p2m and log-dirty code for their tries) */
>      struct page_info * (*alloc_page)(struct domain *d);
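
A note on the bitfield widths in the new struct: on x86, PADDR_BITS =
52, PAGE_SHIFT = 12 and PAGETABLE_ORDER = 9, so the cursor packs into
40 + 9 + 9 = 58 bits, comfortably inside a single unsigned long. If one
wanted the build to enforce that (hypothetical addition, not in the
patch):

    BUILD_BUG_ON((PADDR_BITS - PAGE_SHIFT) + 2 * PAGETABLE_ORDER >
                 BITS_PER_LONG);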
> --- a/xen/include/asm-x86/paging.h
> +++ b/xen/include/asm-x86/paging.h
> @@ -132,9 +132,6 @@ struct paging_mode {
>  /*****************************************************************************
>   * Log dirty code */
>
> -/* free log dirty bitmap resource */
> -void paging_free_log_dirty_bitmap(struct domain *d);
> -
>  /* get the dirty bitmap for a specific range of pfns */
>  void paging_log_dirty_range(struct domain *d,
>                              unsigned long begin_pfn,
> @@ -144,9 +141,6 @@ void paging_log_dirty_range(struct domai
>  /* enable log dirty */
>  int paging_log_dirty_enable(struct domain *d, bool_t log_global);
>
> -/* disable log dirty */
> -int paging_log_dirty_disable(struct domain *d);
> -
>  /* log dirty initialization */
>  void paging_log_dirty_init(struct domain *d,
>                             int  (*enable_log_dirty)(struct domain *d,
> @@ -203,10 +197,13 @@ int paging_domain_init(struct domain *d,
>   * and disable ephemeral shadow modes (test mode and log-dirty mode) and
>   * manipulate the log-dirty bitmap. */
>  int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
> -                  XEN_GUEST_HANDLE_PARAM(void) u_domctl);
> +                  XEN_GUEST_HANDLE_PARAM(void) u_domctl, bool_t resuming);
> +
> +/* Helper hypercall for dealing with continuations. */
> +long paging_domctl_continuation(XEN_GUEST_HANDLE_PARAM(xen_domctl_t));
>
>  /* Call when destroying a domain */
> -void paging_teardown(struct domain *d);
> +int paging_teardown(struct domain *d);
>
>  /* Call once all of the references to the domain have gone away */
>  void paging_final_teardown(struct domain *d);

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel