[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 2 of 2] Improve ring management for memory events. Do not lose guest events
xen/arch/x86/hvm/hvm.c | 21 ++- xen/arch/x86/mm/mem_event.c | 203 +++++++++++++++++++++++++++++---------- xen/arch/x86/mm/mem_sharing.c | 17 ++- xen/arch/x86/mm/p2m.c | 47 +++++---- xen/common/memory.c | 7 +- xen/include/asm-x86/mem_event.h | 16 ++- xen/include/asm-x86/p2m.h | 6 +- xen/include/xen/mm.h | 2 + xen/include/xen/sched.h | 5 +- 9 files changed, 229 insertions(+), 95 deletions(-) The memevent code currently has a mechanism for reserving space in the ring before putting an event, but each caller must individually ensure that the vCPUs are correctly paused if no space is available. This fixes that issue by reversing the semantics: we ensure that enough space is always left for one event per vCPU in the ring. If, after putting the current request, this constraint will be violated by the current vCPU when putting putting another request in the ring, we pause the vCPU. For mem events caused by foreign vcpus, we simply drop the event if the space constraint will be violated. Foreign vcpus are expected to retry mapping operations (and thus the associated paging populate mem events will be re issued). Finally, guests can cause an unbound number of paging drop events via the balloon -> decrease_reservation hypercall. Thus, notify the hypercall there is no more space in the ring so a continuation is scheduled. With all these mechanisms, no guest events are lost and there is no need for wait-queues. Our testing includes 1. ballooning down 512 MiBs; 2. a mem event mode in which every page access in a four-vCPU guest results in an event, with no vCPU pausing, and the four vCPUs touching all RAM. No guest events were lost in either case, and qemu-dm had no mapping problems. Finally, mark_and_pause_vcpus was misleading, beacause it wasn't strictly pausing the vcpu's, rather sending them to sleep. Change to actual vcpu_pause, which protects wakeups via an atomic count. This is useful when an event pauses a vcpu and the vcpu gets mark and paused due to ring congestion. Signed-off-by: Adin Scannell <adin@xxxxxxxxxxx> Signed-off-by: Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx> diff -r 4c4a9ed0728c -r d1d238616c0c xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -4119,7 +4119,6 @@ static int hvm_memory_event_traps(long p struct vcpu* v = current; struct domain *d = v->domain; mem_event_request_t req; - int rc; if ( !(p & HVMPME_MODE_MASK) ) return 0; @@ -4127,10 +4126,7 @@ static int hvm_memory_event_traps(long p if ( (p & HVMPME_onchangeonly) && (value == old) ) return 1; - rc = mem_event_check_ring(d, &d->mem_event->access); - if ( rc ) - return rc; - + memset(&req, 0, sizeof(req)); req.type = MEM_EVENT_TYPE_ACCESS; req.reason = reason; @@ -4150,7 +4146,20 @@ static int hvm_memory_event_traps(long p req.gla_valid = 1; } - mem_event_put_request(d, &d->mem_event->access, &req); + if ( mem_event_put_request(d, &d->mem_event->access, &req) == -ENOSYS ) + { + /* rc == -ENOSYS + * If there was no ring to send the event, then simply unpause the + * vcpu (if we had previously paused it). It will retry the + * instruction (with the exception "handled") and go on */ + if ( req.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) + vcpu_unpause(v); + /* rc == -EBUSY + * If the ring is full, the vcpu has been marked and paused, + * and the exception has been communicated to the consumer. + * Once there is room in the ring, the vcpu will be woken up + * and will retry. */ + } return 1; } diff -r 4c4a9ed0728c -r d1d238616c0c xen/arch/x86/mm/mem_event.c --- a/xen/arch/x86/mm/mem_event.c +++ b/xen/arch/x86/mm/mem_event.c @@ -93,6 +93,9 @@ static int mem_event_enable( put_gfn(dom_mem_event, ring_gfn); put_gfn(dom_mem_event, shared_gfn); + /* Set the number of currently blocked vCPUs to 0. */ + med->blocked = 0; + /* Allocate event channel */ rc = alloc_unbound_xen_event_channel(d->vcpu[0], current->domain->domain_id, @@ -111,7 +114,7 @@ static int mem_event_enable( mem_event_ring_lock_init(med); /* Wake any VCPUs paused for memory events */ - mem_event_unpause_vcpus(d); + mem_event_unpause_vcpus(d, med); return 0; @@ -125,8 +128,57 @@ static int mem_event_enable( return rc; } -static int mem_event_disable(struct mem_event_domain *med) +static void __mem_event_unpause_vcpus(struct domain *d, + struct mem_event_domain *med, int free) { + struct vcpu *v; + int i, j, k; + int online = d->max_vcpus; + + /* + * We ensure that we only have vCPUs online if there are enough free slots + * for their memory events to be processed. This will ensure that no + * memory events are lost (due to the fact that certain types of events + * cannot be replayed, we need to ensure that there is space in the ring + * for when they are hit). + * See comment below in mem_event_put_request(). + */ + for_each_vcpu ( d, v ) + if ( test_bit(_VPF_mem_event, &v->pause_flags) ) + online--; + + /* We remember which vcpu last woke up to avoid scanning always linearly + * from zero and starving higher-numbered vcpus under high load */ + if ( d->vcpu ) + { + for (i = med->last_vcpu_wake_up + 1, j = 0; j < d->max_vcpus; i++, j++) + { + k = i % d->max_vcpus; + v = d->vcpu[k]; + if ( !v ) continue; + + if ( !(med->blocked) || online >= free ) + break; + if ( test_and_clear_bit(_VPF_mem_event, &v->pause_flags) ) + { + vcpu_unpause(v); + online++; + med->blocked--; + med->last_vcpu_wake_up = k; + } + } + } +} + +static int mem_event_disable(struct domain *d, struct mem_event_domain *med) +{ + if ( !med ) + return 0; + + /* Wake up everybody, regardless */ + med->last_vcpu_wake_up = 0; + __mem_event_unpause_vcpus(d, med, d->max_vcpus); + unmap_domain_page(med->ring_page); med->ring_page = NULL; @@ -136,34 +188,98 @@ static int mem_event_disable(struct mem_ return 0; } -void mem_event_put_request(struct domain *d, struct mem_event_domain *med, mem_event_request_t *req) +static inline int mem_event_ring_free(struct domain *d, struct mem_event_domain *med) { + int free_requests; + + free_requests = RING_FREE_REQUESTS(&med->front_ring); + if ( unlikely(free_requests < d->max_vcpus) ) + { + /* This may happen during normal operation (hopefully not often). */ + gdprintk(XENLOG_INFO, "mem_event request slots for domain %d: %d\n", + d->domain_id, free_requests); + } + + return free_requests; +} + +/* Return values + * zero: success + * -ENOSYS: no ring + * -EAGAIN: ring is full and the event has not been transmitted. + * Only foreign vcpu's get EAGAIN + * -EBUSY: guest vcpu has been paused due to ring congestion + */ +int mem_event_put_request(struct domain *d, struct mem_event_domain *med, mem_event_request_t *req) +{ + int ret = 0; + int foreign = (d != current->domain); mem_event_front_ring_t *front_ring; RING_IDX req_prod; + if ( mem_event_check_ring(d, med) ) + return -ENOSYS; + mem_event_ring_lock(med); - front_ring = &med->front_ring; - req_prod = front_ring->req_prod_pvt; - - if ( current->domain != d ) + if ( foreign && + (mem_event_ring_free(d, med) <= (d->max_vcpus - med->blocked)) ) { - req->flags |= MEM_EVENT_FLAG_FOREIGN; - ASSERT( !(req->flags & MEM_EVENT_FLAG_VCPU_PAUSED) ); + /* This is an event caused by a foreign mapper. Putting the event + * in the ring could preclude a guest vcpu from putting an event. + * The foreign mapper has to retry. */ + gdprintk(XENLOG_DEBUG, "Foreign-caused (%u) event will not be put" + " in ring with %d slots %d sleepers", current->domain->domain_id, + mem_event_ring_free(d, med), med->blocked); + mem_event_ring_unlock(med); + return -EAGAIN; } - /* Copy request */ - memcpy(RING_GET_REQUEST(front_ring, req_prod), req, sizeof(*req)); - req_prod++; + if ( mem_event_ring_free(d, med) == 0 ) + { + /* This *should* never happen */ + gdprintk(XENLOG_WARNING, "possible lost mem_event for domain %d\n", + d->domain_id); + ret = -EBUSY; + } + else + { + front_ring = &med->front_ring; + req_prod = front_ring->req_prod_pvt; - /* Update ring */ - med->req_producers--; - front_ring->req_prod_pvt = req_prod; - RING_PUSH_REQUESTS(front_ring); + if ( foreign ) + { + req->flags |= MEM_EVENT_FLAG_FOREIGN; + ASSERT( !(req->flags & MEM_EVENT_FLAG_VCPU_PAUSED) ); + } + + /* Copy request */ + memcpy(RING_GET_REQUEST(front_ring, req_prod), req, sizeof(*req)); + req_prod++; + + /* Update ring */ + front_ring->req_prod_pvt = req_prod; + RING_PUSH_REQUESTS(front_ring); + } + + /* + * We ensure that each vcpu can put at least *one* event -- because some + * events are not repeatable, such as dropping a page. This will ensure no + * vCPU is left with an event that they must place on the ring, but cannot. + * They will be paused after the event is placed. + * See comment above in __mem_event_unpause_vcpus(). + */ + if ( !foreign && mem_event_ring_free(d, med) < d->max_vcpus ) + { + mem_event_mark_and_pause(current, med); + ret = -EBUSY; + } mem_event_ring_unlock(med); notify_via_xen_event_channel(d, med->xen_port); + + return ret; } int mem_event_get_response(struct mem_event_domain *med, mem_event_response_t *rsp) @@ -195,52 +311,31 @@ int mem_event_get_response(struct mem_ev return 1; } -void mem_event_unpause_vcpus(struct domain *d) +void mem_event_unpause_vcpus(struct domain *d, struct mem_event_domain *med) { - struct vcpu *v; + mem_event_ring_lock(med); - for_each_vcpu ( d, v ) - if ( test_and_clear_bit(_VPF_mem_event, &v->pause_flags) ) - vcpu_wake(v); + if ( med->blocked ) + __mem_event_unpause_vcpus(d, med, mem_event_ring_free(d, med)); + + mem_event_ring_unlock(med); } -void mem_event_mark_and_pause(struct vcpu *v) +void mem_event_mark_and_pause(struct vcpu *v, struct mem_event_domain *med) { - set_bit(_VPF_mem_event, &v->pause_flags); - vcpu_sleep_nosync(v); -} - -void mem_event_put_req_producers(struct mem_event_domain *med) -{ - mem_event_ring_lock(med); - med->req_producers--; - mem_event_ring_unlock(med); + if ( !test_and_set_bit(_VPF_mem_event, &v->pause_flags) ) + { + vcpu_pause_nosync(v); + med->blocked++; + } } int mem_event_check_ring(struct domain *d, struct mem_event_domain *med) { - struct vcpu *curr = current; - int free_requests; - int ring_full = 1; + if ( !med->ring_page ) + return -ENOSYS; - if ( !med->ring_page ) - return -1; - - mem_event_ring_lock(med); - - free_requests = RING_FREE_REQUESTS(&med->front_ring); - if ( med->req_producers < free_requests ) - { - med->req_producers++; - ring_full = 0; - } - - if ( ring_full && (curr->domain == d) ) - mem_event_mark_and_pause(curr); - - mem_event_ring_unlock(med); - - return ring_full; + return 0; } /* Registered with Xen-bound event channel for incoming notifications. */ @@ -323,7 +418,7 @@ int mem_event_domctl(struct domain *d, x case XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE: { if ( med->ring_page ) - rc = mem_event_disable(med); + rc = mem_event_disable(d, med); } break; @@ -362,7 +457,7 @@ int mem_event_domctl(struct domain *d, x case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE: { if ( med->ring_page ) - rc = mem_event_disable(med); + rc = mem_event_disable(d, med); } break; diff -r 4c4a9ed0728c -r d1d238616c0c xen/arch/x86/mm/mem_sharing.c --- a/xen/arch/x86/mm/mem_sharing.c +++ b/xen/arch/x86/mm/mem_sharing.c @@ -281,12 +281,20 @@ static struct page_info* mem_sharing_all vcpu_pause_nosync(v); req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED; - if(mem_event_check_ring(d, &d->mem_event->share)) return page; - req.gfn = gfn; req.p2mt = p2m_ram_shared; req.vcpu_id = v->vcpu_id; - mem_event_put_request(d, &d->mem_event->share, &req); + + /* If there is no ring, and we're out of memory, then + * there is no way out. */ + if ( mem_event_put_request(d, &d->mem_event->share, + &req) == -ENOSYS ) + { + gdprintk(XENLOG_ERR, + "Failed alloc on unshare path for %hu and no ring " + "to upcall\n", d->domain_id); + domain_crash(d); + } return page; } @@ -310,6 +318,9 @@ int mem_sharing_sharing_resume(struct do vcpu_unpause(d->vcpu[rsp.vcpu_id]); } + /* Unpause any domains that were paused because the ring was full */ + mem_event_unpause_vcpus(d, &d->mem_event->paging); + return 0; } diff -r 4c4a9ed0728c -r d1d238616c0c xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -871,13 +871,15 @@ int p2m_mem_paging_evict(struct domain * * p2m_mem_paging_drop_page() will notify the pager that a paged-out gfn was * released by the guest. The pager is supposed to drop its reference of the * gfn. + * Returns zero on success, EAGAIN for foreign mappers and EBUSY for guest + * vcpus if the ring is congested. */ -void p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn) +int p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn) { struct vcpu *v = current; mem_event_request_t req; - /* Check that there's space on the ring for this request */ + /* Check that there's a paging listener who cares about this */ if ( mem_event_check_ring(d, &d->mem_event->paging) == 0) { /* Send release notification to pager */ @@ -886,8 +888,11 @@ void p2m_mem_paging_drop_page(struct dom req.gfn = gfn; req.vcpu_id = v->vcpu_id; - mem_event_put_request(d, &d->mem_event->paging, &req); + /* rc cannot be ENOSYS, as we checked before */ + return mem_event_put_request(d, &d->mem_event->paging, &req); } + + return 0; } /** @@ -920,9 +925,14 @@ void p2m_mem_paging_populate(struct doma mfn_t mfn; struct p2m_domain *p2m = p2m_get_hostp2m(d); - /* Check that there's space on the ring for this request */ + /* We're paging. There should be a ring */ if ( mem_event_check_ring(d, &d->mem_event->paging) ) + { + gdprintk(XENLOG_ERR, "Domain %hu paging gfn %lx yet no ring " + "in place\n", d->domain_id, gfn); + domain_crash(d); return; + } memset(&req, 0, sizeof(req)); req.type = MEM_EVENT_TYPE_PAGING; @@ -951,7 +961,6 @@ void p2m_mem_paging_populate(struct doma else if ( p2mt != p2m_ram_paging_out && p2mt != p2m_ram_paged ) { /* gfn is already on its way back and vcpu is not paused */ - mem_event_put_req_producers(&d->mem_event->paging); return; } @@ -960,7 +969,10 @@ void p2m_mem_paging_populate(struct doma req.p2mt = p2mt; req.vcpu_id = v->vcpu_id; - mem_event_put_request(d, &d->mem_event->paging, &req); + (void)mem_event_put_request(d, &d->mem_event->paging, &req); + /* If the ring is full, foreign mappers will retry, and guest + * vcpus remain paused. Guest vcpu will wake up and retry once + * the consumer has made some space in the ring. */ } /** @@ -1094,7 +1106,7 @@ void p2m_mem_paging_resume(struct domain } /* Unpause any domains that were paused because the ring was full */ - mem_event_unpause_vcpus(d); + mem_event_unpause_vcpus(d, &d->mem_event->paging); } void p2m_mem_access_check(unsigned long gpa, bool_t gla_valid, unsigned long gla, @@ -1105,7 +1117,6 @@ void p2m_mem_access_check(unsigned long unsigned long gfn = gpa >> PAGE_SHIFT; struct domain *d = v->domain; struct p2m_domain* p2m = p2m_get_hostp2m(d); - int res; mfn_t mfn; p2m_type_t p2mt; p2m_access_t p2ma; @@ -1123,17 +1134,14 @@ void p2m_mem_access_check(unsigned long p2m_unlock(p2m); /* Otherwise, check if there is a memory event listener, and send the message along */ - res = mem_event_check_ring(d, &d->mem_event->access); - if ( res < 0 ) + if ( mem_event_check_ring(d, &d->mem_event->access) == -ENOSYS ) { /* No listener */ if ( p2m->access_required ) { - printk(XENLOG_INFO - "Memory access permissions failure, no mem_event listener: pausing VCPU %d, dom %d\n", - v->vcpu_id, d->domain_id); - - mem_event_mark_and_pause(v); + gdprintk(XENLOG_INFO, "Memory access permissions failure, no mem_event " + "listener VCPU %d, dom %d\n", v->vcpu_id, d->domain_id); + domain_crash(v->domain); } else { @@ -1145,8 +1153,6 @@ void p2m_mem_access_check(unsigned long return; } - else if ( res > 0 ) - return; /* No space in buffer; VCPU paused */ memset(&req, 0, sizeof(req)); req.type = MEM_EVENT_TYPE_ACCESS; @@ -1167,9 +1173,8 @@ void p2m_mem_access_check(unsigned long req.vcpu_id = v->vcpu_id; - mem_event_put_request(d, &d->mem_event->access, &req); - - /* VCPU paused, mem event request sent */ + (void)mem_event_put_request(d, &d->mem_event->access, &req); + /* VCPU paused */ } void p2m_mem_access_resume(struct domain *d) @@ -1188,7 +1193,7 @@ void p2m_mem_access_resume(struct domain /* Unpause any domains that were paused because the ring was full or no listener * was available */ - mem_event_unpause_vcpus(d); + mem_event_unpause_vcpus(d, &d->mem_event->access); } diff -r 4c4a9ed0728c -r d1d238616c0c xen/common/memory.c --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -165,10 +165,13 @@ int guest_remove_page(struct domain *d, mfn = mfn_x(get_gfn(d, gmfn, &p2mt)); if ( unlikely(p2m_is_paging(p2mt)) ) { + int rc; guest_physmap_remove_page(d, gmfn, mfn, 0); - p2m_mem_paging_drop_page(d, gmfn); + rc = p2m_mem_paging_drop_page(d, gmfn); put_gfn(d, gmfn); - return 1; + /* drop_page returns zero on success, we return 1 on success. + * drop_page returns negative on error, never returns 1. */ + return rc ? rc : 1; } #else mfn = gmfn_to_mfn(d, gmfn); diff -r 4c4a9ed0728c -r d1d238616c0c xen/include/asm-x86/mem_event.h --- a/xen/include/asm-x86/mem_event.h +++ b/xen/include/asm-x86/mem_event.h @@ -25,12 +25,18 @@ #define __MEM_EVENT_H__ /* Pauses VCPU while marking pause flag for mem event */ -void mem_event_mark_and_pause(struct vcpu *v); +void mem_event_mark_and_pause(struct vcpu *v, struct mem_event_domain *med); int mem_event_check_ring(struct domain *d, struct mem_event_domain *med); -void mem_event_put_req_producers(struct mem_event_domain *med); -void mem_event_put_request(struct domain *d, struct mem_event_domain *med, mem_event_request_t *req); -int mem_event_get_response(struct mem_event_domain *med, mem_event_response_t *rsp); -void mem_event_unpause_vcpus(struct domain *d); +void mem_event_unpause_vcpus(struct domain *d, struct mem_event_domain *med); + +/* Returns 0 on success, -ENOSYS if there is no ring, -EBUSY if there is + * a ring or no space. FOr success or -EBUSY. the vCPU is left blocked to + * ensure that the ring does not lose events. In general, put_request should + * not fail, as it attempts to ensure that each vCPU can put at least one req. */ +int mem_event_put_request(struct domain *d, struct mem_event_domain *med, + mem_event_request_t *req); +int mem_event_get_response(struct mem_event_domain *med, + mem_event_response_t *rsp); int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec, XEN_GUEST_HANDLE(void) u_domctl); diff -r 4c4a9ed0728c -r d1d238616c0c xen/include/asm-x86/p2m.h --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -475,7 +475,7 @@ int p2m_mem_paging_nominate(struct domai /* Evict a frame */ int p2m_mem_paging_evict(struct domain *d, unsigned long gfn); /* Tell xenpaging to drop a paged out frame */ -void p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn); +int p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn); /* Start populating a paged out frame */ void p2m_mem_paging_populate(struct domain *d, unsigned long gfn); /* Prepare the p2m for paging a frame in */ @@ -483,8 +483,8 @@ int p2m_mem_paging_prep(struct domain *d /* Resume normal operation (in case a domain was paused) */ void p2m_mem_paging_resume(struct domain *d); #else -static inline void p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn) -{ } +static inline int p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn) +{ return 0; } static inline void p2m_mem_paging_populate(struct domain *d, unsigned long gfn) { } #endif diff -r 4c4a9ed0728c -r d1d238616c0c xen/include/xen/mm.h --- a/xen/include/xen/mm.h +++ b/xen/include/xen/mm.h @@ -318,6 +318,8 @@ page_list_splice(struct page_list_head * void scrub_one_page(struct page_info *); +/* Returns 1 on success, 0 on error, negative if the ring + * for event propagation is full in the presence of paging */ int guest_remove_page(struct domain *d, unsigned long gmfn); #define RAM_TYPE_CONVENTIONAL 0x00000001 diff -r 4c4a9ed0728c -r d1d238616c0c xen/include/xen/sched.h --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -183,13 +183,16 @@ struct mem_event_domain { /* ring lock */ spinlock_t ring_lock; - unsigned int req_producers; /* shared page */ mem_event_shared_page_t *shared_page; /* shared ring page */ void *ring_page; /* front-end ring */ mem_event_front_ring_t front_ring; + /* the number of vCPUs blocked */ + unsigned int blocked; + /* The last vcpu woken up */ + unsigned int last_vcpu_wake_up; /* event channel port (vcpu0 only) */ int xen_port; }; _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |