Xen project Mailing List

[Xen-devel] [PATCH 1 of 4] Improve ring management for memory events

From: Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx>

Date: Mon, 14 Nov 2011 16:58:31 -0500

Cc: andres@xxxxxxxxxxxxxx, keir.xen@xxxxxxxxx, tim@xxxxxxx, olaf@xxxxxxxxx, adin@xxxxxxxxxxxxxx

Delivery-date: Mon, 14 Nov 2011 14:21:48 -0800

Domainkey-signature: a=rsa-sha1; c=nofws; d=lagarcavilla.org; h=content-type :mime-version:content-transfer-encoding:subject:message-id :in-reply-to:references:date:from:to:cc; q=dns; s= lagarcavilla.org; b=V87jRvFGzqg5MhjKATX+I4RkXTh8GDEAN1VRSiTY0KSv 4FaZjbiGDeII0aie1smCkKRZvFmfqvvy5bx6E07t+eLvDCQ8u28cegoUw8Pbzt06 IKhaALIMl4GKsJ5rUgpg0tsvsnODOEiF1Db226CGVzawqlXUlkB+GUqtAGGiZOM=

List-id: Xen developer discussion <xen-devel.lists.xensource.com>

xen/arch/x86/hvm/hvm.c | 21 +++- xen/arch/x86/mm/mem_event.c | 173 +++++++++++++++++++++++++++++---------- xen/arch/x86/mm/mem_sharing.c | 27 ++++- xen/arch/x86/mm/p2m.c | 100 ++++++++++++---------- xen/include/asm-x86/mem_event.h | 16 ++- xen/include/xen/sched.h | 5 +- 6 files changed, 232 insertions(+), 110 deletions(-) The memevent code currently has a mechanism for reserving space in the ring before putting an event, but each caller must individually ensure that the vCPUs are correctly paused if no space is available. This fixes that issue by reversing the semantics: we ensure that enough space is always left for one event per vCPU in the ring. If this constraint will be violated by the current vCPU putting another request in the ring, we pause it after putting the current event. If no foreign mappings cause mem events, then no mem events will be lost. Because events caused by foreign mappings are effectively unbounded, we cannot guarantee with certainty that no events are lost. Additionally, we ensure that no events are lost by Xen as a consumer if multiple responses land on the ring for a single domctl. While the current domctl-based notifications to Xen disallow batching, this is required for later patches. Finally, mark_and_pause_vcpus was misleading, because it wasn't strictly pausing the vCPUs, rather sending them to sleep. Change to actual vcpu_pause, which protects wakeups via an atomic count. This is useful when an event pauses a vcpu and the vcpu gets marked and paused due to ring congestion. 
Signed-off-by: Adin Scannell <adin@xxxxxxxxxxx> Signed-off-by: Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx> diff -r 19a5a2cddad3 -r ee909e5a9d85 xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -4077,7 +4077,6 @@ static int hvm_memory_event_traps(long p struct vcpu* v = current; struct domain *d = v->domain; mem_event_request_t req; - int rc; if ( !(p & HVMPME_MODE_MASK) ) return 0; @@ -4085,10 +4084,6 @@ static int hvm_memory_event_traps(long p if ( (p & HVMPME_onchangeonly) && (value == old) ) return 1; - rc = mem_event_check_ring(d, &d->mem_access); - if ( rc ) - return rc; - memset(&req, 0, sizeof(req)); req.type = MEM_EVENT_TYPE_ACCESS; req.reason = reason; @@ -4108,7 +4103,21 @@ static int hvm_memory_event_traps(long p req.gla_valid = 1; } - mem_event_put_request(d, &d->mem_access, &req); + if ( mem_event_put_request(d, &d->mem_access, &req) ) + { + /* rc == -ENOSYS + * If there was no ring to send the event, then simply unpause the + * vcpu (if we had previously paused it). It will retry the + * instruction (with the exception "handled") and go on + * rc == -EBUSY + * If the ring is full, the vcpu has been marked and paused, + * so we also need to unpause it (if we had previously paused + * it). Once the consumer depletes the ring, the vcpu will be + * woken up and will retry. The consumer may have lost this + * exception, though. */ + if ( req.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) + vcpu_unpause(v); + } return 1; } diff -r 19a5a2cddad3 -r ee909e5a9d85 xen/arch/x86/mm/mem_event.c --- a/xen/arch/x86/mm/mem_event.c +++ b/xen/arch/x86/mm/mem_event.c @@ -91,6 +91,9 @@ static int mem_event_enable(struct domai put_gfn(dom_mem_event, ring_gfn); put_gfn(dom_mem_event, shared_gfn); + /* Set the number of currently blocked vCPUs to 0. 
*/ + med->blocked = 0; + /* Allocate event channel */ rc = alloc_unbound_xen_event_channel(d->vcpu[0], current->domain->domain_id); @@ -108,7 +111,7 @@ static int mem_event_enable(struct domai mem_event_ring_lock_init(med); /* Wake any VCPUs paused for memory events */ - mem_event_unpause_vcpus(d); + mem_event_unpause_vcpus(d, med); return 0; @@ -133,31 +136,83 @@ static int mem_event_disable(struct mem_ return 0; } -void mem_event_put_request(struct domain *d, struct mem_event_domain *med, mem_event_request_t *req) +static inline int mem_event_ring_free(struct domain *d, struct mem_event_domain *med) { + int free_requests; + + free_requests = RING_FREE_REQUESTS(&med->front_ring); + if ( unlikely(free_requests < d->max_vcpus) ) + { + /* This may happen during normal operation (hopefully not often). */ + gdprintk(XENLOG_INFO, "mem_event request slots for domain %d: %d\n", + d->domain_id, free_requests); + } + + return free_requests; +} + +/* Return values + * zero: success + * -ENOSYS: no ring + * -EAGAIN: ring is full and the event has not been transmitted. + * Only foreign vcpu's get EAGAIN + * -EBUSY: guest vcpu has been paused due to ring congestion + */ +int mem_event_put_request(struct domain *d, struct mem_event_domain *med, mem_event_request_t *req) +{ + int ret = 0; mem_event_front_ring_t *front_ring; RING_IDX req_prod; + if ( mem_event_check_ring(d, med) ) + return -ENOSYS; + mem_event_ring_lock(med); - front_ring = &med->front_ring; - req_prod = front_ring->req_prod_pvt; + if ( mem_event_ring_free(d, med) == 0 ) + { + /* This *may* happen, but only when foreign mappings generate events. 
*/ + gdprintk(XENLOG_WARNING, "possible lost mem_event for domain %d\n", + d->domain_id); + ret = -EAGAIN; + } + else + { + front_ring = &med->front_ring; + req_prod = front_ring->req_prod_pvt; - /* Copy request */ - memcpy(RING_GET_REQUEST(front_ring, req_prod), req, sizeof(*req)); - req_prod++; + /* Copy request */ + memcpy(RING_GET_REQUEST(front_ring, req_prod), req, sizeof(*req)); + req_prod++; - /* Update ring */ - med->req_producers--; - front_ring->req_prod_pvt = req_prod; - RING_PUSH_REQUESTS(front_ring); + /* Update ring */ + front_ring->req_prod_pvt = req_prod; + RING_PUSH_REQUESTS(front_ring); + } + + /* + * We ensure that each vcpu can put at least *one* event -- because some + * events are not repeatable, such as dropping a page. This will ensure no + * vCPU is left with an event that they must place on the ring, but cannot. + * They will be paused after the event is placed. + * See large comment below in mem_event_unpause_vcpus(). + */ + if ( current->domain == d && + mem_event_ring_free(d, med) < d->max_vcpus ) + { + mem_event_mark_and_pause(current, med); + /* For guest vcpu, overwrite EAGAIN */ + ret = -EBUSY; + } mem_event_ring_unlock(med); notify_via_xen_event_channel(d, med->xen_port); + + return ret; } -void mem_event_get_response(struct mem_event_domain *med, mem_event_response_t *rsp) +int mem_event_get_response(struct domain *d, struct mem_event_domain *med, mem_event_response_t *rsp) { mem_event_front_ring_t *front_ring; RING_IDX rsp_cons; @@ -167,6 +222,12 @@ void mem_event_get_response(struct mem_e front_ring = &med->front_ring; rsp_cons = front_ring->rsp_cons; + if ( !RING_HAS_UNCONSUMED_RESPONSES(front_ring) ) + { + mem_event_ring_unlock(med); + return 0; + } + /* Copy response */ memcpy(rsp, RING_GET_RESPONSE(front_ring, rsp_cons), sizeof(*rsp)); rsp_cons++; @@ -176,54 +237,74 @@ void mem_event_get_response(struct mem_e front_ring->sring->rsp_event = rsp_cons + 1; mem_event_ring_unlock(med); + + return 1; } -void 
mem_event_unpause_vcpus(struct domain *d) +void mem_event_unpause_vcpus(struct domain *d, struct mem_event_domain *med) { struct vcpu *v; + int free, i, j, k; + int online = d->max_vcpus; + if ( !med->blocked ) + return; + + mem_event_ring_lock(med); + free = mem_event_ring_free(d, med); + + /* + * We ensure that we only have vCPUs online if there are enough free slots + * for their memory events to be processed. This will ensure that no + * memory events are lost (due to the fact that certain types of events + * cannot be replayed, we need to ensure that there is space in the ring + * for when they are hit). + * See large comment above in mem_event_put_request(). + */ for_each_vcpu ( d, v ) - if ( test_and_clear_bit(_VPF_mem_event, &v->pause_flags) ) - vcpu_wake(v); + if ( test_bit(_VPF_mem_event, &v->pause_flags) ) + online--; + + /* We remember which vcpu last woke up to avoid scanning always linearly + * from xero and starving higher-numbered vcpus under high load */ + if ( d->vcpu ) + { + for (i = med->last_vcpu_wake_up + 1, j = 0; j < d->max_vcpus; i++, j++) + { + k = i % d->max_vcpus; + v = d->vcpu[k]; + if ( !v ) continue; + + if ( !(med->blocked) || online >= free ) + break; + if ( test_and_clear_bit(_VPF_mem_event, &v->pause_flags) ) + { + vcpu_unpause(v); + online++; + med->blocked--; + med->last_vcpu_wake_up = k; + } + } + } + + mem_event_ring_unlock(med); } -void mem_event_mark_and_pause(struct vcpu *v) +void mem_event_mark_and_pause(struct vcpu *v, struct mem_event_domain *med) { - set_bit(_VPF_mem_event, &v->pause_flags); - vcpu_sleep_nosync(v); -} - -void mem_event_put_req_producers(struct mem_event_domain *med) -{ - mem_event_ring_lock(med); - med->req_producers--; - mem_event_ring_unlock(med); + if ( !test_and_set_bit(_VPF_mem_event, &v->pause_flags) ) + { + vcpu_pause_nosync(v); + med->blocked++; + } } int mem_event_check_ring(struct domain *d, struct mem_event_domain *med) { - struct vcpu *curr = current; - int free_requests; - int ring_full = 
1; + if ( !med->ring_page ) + return -ENOSYS; - if ( !med->ring_page ) - return -1; - - mem_event_ring_lock(med); - - free_requests = RING_FREE_REQUESTS(&med->front_ring); - if ( med->req_producers < free_requests ) - { - med->req_producers++; - ring_full = 0; - } - - if ( ring_full && (curr->domain == d) ) - mem_event_mark_and_pause(curr); - - mem_event_ring_unlock(med); - - return ring_full; + return 0; } int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec, diff -r 19a5a2cddad3 -r ee909e5a9d85 xen/arch/x86/mm/mem_sharing.c --- a/xen/arch/x86/mm/mem_sharing.c +++ b/xen/arch/x86/mm/mem_sharing.c @@ -281,12 +281,19 @@ static struct page_info* mem_sharing_all vcpu_pause_nosync(v); req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED; - if(mem_event_check_ring(d, &d->mem_share)) return page; - req.gfn = gfn; req.p2mt = p2m_ram_shared; req.vcpu_id = v->vcpu_id; - mem_event_put_request(d, &d->mem_share, &req); + + /* If there is no ring, and we're out of memory, then + * there is no way out. 
*/ + if ( mem_event_put_request(d, &d->mem_share, &req) == -ENOSYS ) + { + gdprintk(XENLOG_ERR, + "Failed alloc on unshare path for %hu and no ring " + "to upcall\n", d->domain_id); + domain_crash(d); + } return page; } @@ -300,12 +307,16 @@ int mem_sharing_sharing_resume(struct do { mem_event_response_t rsp; - /* Get request off the ring */ - mem_event_get_response(&d->mem_share, &rsp); + /* Get all requests off the ring */ + while ( mem_event_get_response(d, &d->mem_share, &rsp) ) + { + /* Unpause domain/vcpu */ + if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) + vcpu_unpause(d->vcpu[rsp.vcpu_id]); + } - /* Unpause domain/vcpu */ - if( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) - vcpu_unpause(d->vcpu[rsp.vcpu_id]); + /* Unpause any domains that were paused because the ring was full */ + mem_event_unpause_vcpus(d, &d->mem_paging); return 0; } diff -r 19a5a2cddad3 -r ee909e5a9d85 xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -877,7 +877,7 @@ void p2m_mem_paging_drop_page(struct dom struct vcpu *v = current; mem_event_request_t req; - /* Check that there's space on the ring for this request */ + /* Check that there's a paging listener who cares about this */ if ( mem_event_check_ring(d, &d->mem_paging) == 0) { /* Send release notification to pager */ @@ -886,7 +886,8 @@ void p2m_mem_paging_drop_page(struct dom req.gfn = gfn; req.vcpu_id = v->vcpu_id; - mem_event_put_request(d, &d->mem_paging, &req); + /* If the event is lost, tough bananas */ + (void)mem_event_put_request(d, &d->mem_paging, &req); } } @@ -920,9 +921,14 @@ void p2m_mem_paging_populate(struct doma mfn_t mfn; struct p2m_domain *p2m = p2m_get_hostp2m(d); - /* Check that there's space on the ring for this request */ + /* We're paging. 
There should be a ring */ if ( mem_event_check_ring(d, &d->mem_paging) ) + { + gdprintk(XENLOG_ERR, "Domain %hu paging gfn %lx yet no ring " + "in place\n", d->domain_id, gfn); + domain_crash(d); return; + } memset(&req, 0, sizeof(req)); req.type = MEM_EVENT_TYPE_PAGING; @@ -951,7 +957,6 @@ void p2m_mem_paging_populate(struct doma else if ( p2mt != p2m_ram_paging_out && p2mt != p2m_ram_paged ) { /* gfn is already on its way back and vcpu is not paused */ - mem_event_put_req_producers(&d->mem_paging); return; } @@ -960,7 +965,14 @@ void p2m_mem_paging_populate(struct doma req.p2mt = p2mt; req.vcpu_id = v->vcpu_id; - mem_event_put_request(d, &d->mem_paging, &req); + if ( mem_event_put_request(d, &d->mem_paging, &req) == -EBUSY) + { + /* The ring is full, so we unpause the vcpu (if we had paused + * it), and let it retry, once the consumer has made some space + * in the ring. Checks when pausing ensure this is a guest vcpu */ + if ( req.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) + vcpu_unpause(v); + } } /** @@ -1038,31 +1050,32 @@ void p2m_mem_paging_resume(struct domain p2m_access_t a; mfn_t mfn; - /* Pull the response off the ring */ - mem_event_get_response(&d->mem_paging, &rsp); + /* Pull all responses off the ring */ + while( mem_event_get_response(d, &d->mem_paging, &rsp) ) + { + /* Fix p2m entry if the page was not dropped */ + if ( !(rsp.flags & MEM_EVENT_FLAG_DROP_PAGE) ) + { + p2m_lock(p2m); + mfn = p2m->get_entry(p2m, rsp.gfn, &p2mt, &a, p2m_query, NULL); + /* Allow only pages which were prepared properly, or pages which + * were nominated but not evicted */ + if ( mfn_valid(mfn) && + (p2mt == p2m_ram_paging_in || p2mt == p2m_ram_paging_in_start) ) + { + set_p2m_entry(p2m, rsp.gfn, mfn, PAGE_ORDER_4K, p2m_ram_rw, a); + set_gpfn_from_mfn(mfn_x(mfn), rsp.gfn); + } + p2m_unlock(p2m); + } - /* Fix p2m entry if the page was not dropped */ - if ( !(rsp.flags & MEM_EVENT_FLAG_DROP_PAGE) ) - { - p2m_lock(p2m); - mfn = p2m->get_entry(p2m, rsp.gfn, &p2mt, &a, p2m_query, 
NULL); - /* Allow only pages which were prepared properly, or pages which - * were nominated but not evicted */ - if ( mfn_valid(mfn) && - (p2mt == p2m_ram_paging_in || p2mt == p2m_ram_paging_in_start) ) - { - set_p2m_entry(p2m, rsp.gfn, mfn, PAGE_ORDER_4K, p2m_ram_rw, a); - set_gpfn_from_mfn(mfn_x(mfn), rsp.gfn); - } - p2m_unlock(p2m); + /* Unpause domain */ + if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) + vcpu_unpause(d->vcpu[rsp.vcpu_id]); } - /* Unpause domain */ - if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) - vcpu_unpause(d->vcpu[rsp.vcpu_id]); - /* Unpause any domains that were paused because the ring was full */ - mem_event_unpause_vcpus(d); + mem_event_unpause_vcpus(d, &d->mem_paging); } void p2m_mem_access_check(unsigned long gpa, bool_t gla_valid, unsigned long gla, @@ -1073,7 +1086,6 @@ void p2m_mem_access_check(unsigned long unsigned long gfn = gpa >> PAGE_SHIFT; struct domain *d = v->domain; struct p2m_domain* p2m = p2m_get_hostp2m(d); - int res; mfn_t mfn; p2m_type_t p2mt; p2m_access_t p2ma; @@ -1091,17 +1103,14 @@ void p2m_mem_access_check(unsigned long p2m_unlock(p2m); /* Otherwise, check if there is a memory event listener, and send the message along */ - res = mem_event_check_ring(d, &d->mem_access); - if ( res < 0 ) + if ( mem_event_check_ring(d, &d->mem_access) == -ENOSYS ) { /* No listener */ if ( p2m->access_required ) { - printk(XENLOG_INFO - "Memory access permissions failure, no mem_event listener: pausing VCPU %d, dom %d\n", - v->vcpu_id, d->domain_id); - - mem_event_mark_and_pause(v); + gdprintk(XENLOG_INFO, "Memory access permissions failure, no mem_event " + "listener VCPU %d, dom %d\n", v->vcpu_id, d->domain_id); + domain_crash(v->domain); } else { @@ -1113,8 +1122,6 @@ void p2m_mem_access_check(unsigned long return; } - else if ( res > 0 ) - return; /* No space in buffer; VCPU paused */ memset(&req, 0, sizeof(req)); req.type = MEM_EVENT_TYPE_ACCESS; @@ -1135,9 +1142,12 @@ void p2m_mem_access_check(unsigned long req.vcpu_id = 
v->vcpu_id; - mem_event_put_request(d, &d->mem_access, &req); + /* If the ring was busy, this guest vcpu got marked and paused (again), so + * we can remove our unconditional vcpu pause. */ + if ( mem_event_put_request(d, &d->mem_access, &req) == -EBUSY ) + vcpu_unpause(v); - /* VCPU paused, mem event request sent */ + /* VCPU paused */ } void p2m_mem_access_resume(struct p2m_domain *p2m) @@ -1145,15 +1155,17 @@ void p2m_mem_access_resume(struct p2m_do struct domain *d = p2m->domain; mem_event_response_t rsp; - mem_event_get_response(&d->mem_access, &rsp); - - /* Unpause domain */ - if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) - vcpu_unpause(d->vcpu[rsp.vcpu_id]); + /* Pull all responses off the ring */ + while( mem_event_get_response(d, &d->mem_access, &rsp) ) + { + /* Unpause domain */ + if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) + vcpu_unpause(d->vcpu[rsp.vcpu_id]); + } /* Unpause any domains that were paused because the ring was full or no listener * was available */ - mem_event_unpause_vcpus(d); + mem_event_unpause_vcpus(d, &d->mem_access); } diff -r 19a5a2cddad3 -r ee909e5a9d85 xen/include/asm-x86/mem_event.h --- a/xen/include/asm-x86/mem_event.h +++ b/xen/include/asm-x86/mem_event.h @@ -25,12 +25,18 @@ #define __MEM_EVENT_H__ /* Pauses VCPU while marking pause flag for mem event */ -void mem_event_mark_and_pause(struct vcpu *v); +void mem_event_mark_and_pause(struct vcpu *v, struct mem_event_domain *med); int mem_event_check_ring(struct domain *d, struct mem_event_domain *med); -void mem_event_put_req_producers(struct mem_event_domain *med); -void mem_event_put_request(struct domain *d, struct mem_event_domain *med, mem_event_request_t *req); -void mem_event_get_response(struct mem_event_domain *med, mem_event_response_t *rsp); -void mem_event_unpause_vcpus(struct domain *d); +void mem_event_unpause_vcpus(struct domain *d, struct mem_event_domain *med); + +/* Returns 0 on success, -ENOSYS if there is no ring, -EBUSY if there is + * a ring or no space. 
FOr success or -EBUSY. the vCPU is left blocked to + * ensure that the ring does not lose events. In general, put_request should + * not fail, as it attempts to ensure that each vCPU can put at least one req. */ +int mem_event_put_request(struct domain *d, struct mem_event_domain *med, + mem_event_request_t *req); +int mem_event_get_response(struct domain *d, struct mem_event_domain *med, + mem_event_response_t *rsp); int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec, XEN_GUEST_HANDLE(void) u_domctl); diff -r 19a5a2cddad3 -r ee909e5a9d85 xen/include/xen/sched.h --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -183,13 +183,16 @@ struct mem_event_domain { /* ring lock */ spinlock_t ring_lock; - unsigned int req_producers; /* shared page */ mem_event_shared_page_t *shared_page; /* shared ring page */ void *ring_page; /* front-end ring */ mem_event_front_ring_t front_ring; + /* the number of vCPUs blocked */ + unsigned int blocked; + /* The last vcpu woken up */ + unsigned int last_vcpu_wake_up; /* event channel port (vcpu0 only) */ int xen_port; }; _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.