[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v4 2/2] x86/ioreq: Extend ioreq server to support multiple ioreq pages



A single shared ioreq page provides PAGE_SIZE/sizeof(ioreq_t) = 128
slots, limiting HVM guests to 128 vCPUs. To support more vCPUs, the
single struct ioreq_page in the ioreq_server is replaced with an ioreq_t
pointer backed by a dynamically sized allocation.
For the resource mapping path (XENMEM_acquire_resource), the common
ioreq_server_alloc_mfn() allocates nr_ioreq_pages(d) domain heap pages
with MEMF_no_refcount and writable type references, then maps them as a
single contiguous VA via vmap(). Teardown recovers the pages from the
vmap address via vmap_size() and vmap_to_page().
The legacy GFN-mapped path continues to support only a single ioreq
page. For domains whose vCPU count exceeds PAGE_SIZE/sizeof(ioreq_t),
hvm_map_ioreq_gfn() returns -EOPNOTSUPP. Those domains need to use the
resource mapping interface. This also avoids fragmentation when
allocating multiple slots from the GFN pool. The GFN path uses vmap()
for the Xen-side mapping, consistent with ioreq_server_alloc_mfn(),
allowing vmap_to_page() to recover the page during teardown.

Signed-off-by: Julian Vetter <julian.vetter@xxxxxxxxxx>
---
Changes in v4:
- Dropped the multi-page support for the legacy GFN-mapped path
  - When 'nr_ioreq_pages(d) > 1' -> -EOPNOTSUPP
  - But now also use vmap(), same as in resource mapping path, so new
    s->ioreq pointer is used in either case
  - Mirror exactly what prepare_ring_for_helper/destroy_ring_for_helper
    is doing except the __map_domain_page_global (replaced by
    vmap/vunmap)
- Replaced xvzalloc_array() by xvmalloc_array() -> No need to have mfns
  array zero'ed
- Inverted logic in ioreq_server_alloc_mfn() and ioreq_server_free_mfn()
  to check for 'if (buf)' (same as in the GFN-mapped path)
- Fixed ioreq_server_alloc_mfn -> Correctly call alloc_domheap_page +
  get_page_and_type
- Fixed error handling and teardown path to correctly call
  put_page_alloc_ref + put_page_and_type just like for the bufioreq
- Kept shared_iopage_t so QEMU, varstored, etc. can just reference into
  it via p->ioreq[cpu_id] -> No modification needed
---
 xen/arch/x86/hvm/ioreq.c | 153 ++++++++++++++++++++++++++++++++----
 xen/common/ioreq.c       | 166 ++++++++++++++++++++++++++++++++-------
 xen/include/xen/ioreq.h  |  10 ++-
 3 files changed, 286 insertions(+), 43 deletions(-)

diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 355b2ba12c..ec4f210768 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -15,6 +15,7 @@
 #include <xen/sched.h>
 #include <xen/softirq.h>
 #include <xen/trace.h>
+#include <xen/vmap.h>
 #include <xen/vpci.h>
 
 #include <asm/hvm/emulate.h>
@@ -123,9 +124,10 @@ static void hvm_free_ioreq_gfn(struct ioreq_server *s, gfn_t gfn)
 
 static void hvm_unmap_ioreq_gfn(struct ioreq_server *s, bool buf)
 {
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-
+    if ( buf )
     {
+        struct ioreq_page *iorp = &s->bufioreq;
+
         if ( gfn_eq(iorp->gfn, INVALID_GFN) )
             return;
 
@@ -134,16 +136,33 @@ static void hvm_unmap_ioreq_gfn(struct ioreq_server *s, bool buf)
 
         hvm_free_ioreq_gfn(s, iorp->gfn);
         iorp->gfn = INVALID_GFN;
+        return;
     }
+
+    if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+        return;
+
+    put_page_and_type(vmap_to_page((void *)s->ioreq));
+    vunmap(s->ioreq);
+    s->ioreq = NULL;
+
+    hvm_free_ioreq_gfn(s, s->ioreq_gfn);
+    s->ioreq_gfn = INVALID_GFN;
 }
 
 static int hvm_map_ioreq_gfn(struct ioreq_server *s, bool buf)
 {
     struct domain *d = s->target;
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    gfn_t base_gfn;
+    struct page_info *pg;
+    p2m_type_t p2mt;
+    mfn_t mfn;
     int rc;
 
+    if ( buf )
     {
+        struct ioreq_page *iorp = &s->bufioreq;
+
         if ( iorp->page )
         {
             /*
@@ -173,35 +192,141 @@ static int hvm_map_ioreq_gfn(struct ioreq_server *s, bool buf)
 
         return rc;
     }
+
+    if ( s->ioreq )
+    {
+        /*
+         * If pages have already been allocated (which will happen on
+         * demand if ioreq_server_get_frame() is called), then
+         * mapping a guest frame is not permitted.
+         */
+        if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+            return -EPERM;
+        return 0;
+    }
+
+    /*
+     * The legacy GFN path supports only a single ioreq page. Guests requiring
+     * more ioreq slots must use the resource mapping interface
+     * (XENMEM_acquire_resource).
+     */
+    if ( nr_ioreq_pages(d) > 1 )
+        return -EOPNOTSUPP;
+
+    if ( d->is_dying )
+        return -EINVAL;
+
+    base_gfn = hvm_alloc_ioreq_gfn(s);
+
+    if ( gfn_eq(base_gfn, INVALID_GFN) )
+        return -ENOMEM;
+
+    /*
+     * The page management is the same as prepare_ring_for_helper(), but vmap()
+     * is used instead of __map_domain_page_global() to be consistent with
+     * ioreq_server_alloc_mfn(), which uses vmap() to map potentially multiple
+     * pages. This also allows vmap_to_page() to recover the struct page_info *
+     * from s->ioreq during teardown.
+     */
+    rc = check_get_page_from_gfn(d, base_gfn, false, &p2mt, &pg);
+    if ( rc )
+    {
+        if ( rc == -EAGAIN )
+            rc = -ENOENT;
+        goto fail;
+    }
+
+    if ( !get_page_type(pg, PGT_writable_page) )
+    {
+        put_page(pg);
+        rc = -EINVAL;
+        goto fail;
+    }
+
+    mfn = page_to_mfn(pg);
+    s->ioreq = vmap(&mfn, 1);
+    if ( !s->ioreq )
+    {
+        put_page_and_type(pg);
+        rc = -ENOMEM;
+        goto fail;
+    }
+
+    s->ioreq_gfn = base_gfn;
+    return 0;
+
+ fail:
+    hvm_free_ioreq_gfn(s, base_gfn);
+    return rc;
 }
 
 static void hvm_remove_ioreq_gfn(struct ioreq_server *s, bool buf)
 {
     struct domain *d = s->target;
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    struct page_info *pg;
+    void *va;
+    gfn_t gfn;
 
-    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
-        return;
+    if ( buf )
+    {
+        struct ioreq_page *iorp = &s->bufioreq;
+
+        if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+            return;
+
+        gfn = iorp->gfn;
+        pg = iorp->page;
+        va = iorp->va;
+    }
+    else
+    {
+        if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+            return;
 
-    if ( p2m_remove_page(d, iorp->gfn, page_to_mfn(iorp->page), 0) )
+        gfn = s->ioreq_gfn;
+        pg = vmap_to_page(s->ioreq);
+        va = s->ioreq;
+    }
+
+    if ( p2m_remove_page(d, gfn, page_to_mfn(pg), 0) )
         domain_crash(d);
-    clear_page(iorp->va);
+    clear_page(va);
 }
 
 static int hvm_add_ioreq_gfn(struct ioreq_server *s, bool buf)
 {
     struct domain *d = s->target;
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    struct page_info *pg;
+    void *va;
+    gfn_t gfn;
     int rc;
 
-    if ( gfn_eq(iorp->gfn, INVALID_GFN) )
-        return 0;
+    if ( buf )
+    {
+        struct ioreq_page *iorp = &s->bufioreq;
+
+        if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+            return 0;
+
+        gfn = iorp->gfn;
+        pg = iorp->page;
+        va = iorp->va;
+    }
+    else
+    {
+        if ( gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+            return 0;
+
+        gfn = s->ioreq_gfn;
+        pg = vmap_to_page(s->ioreq);
+        va = s->ioreq;
+    }
 
-    clear_page(iorp->va);
+    clear_page(va);
 
-    rc = p2m_add_page(d, iorp->gfn, page_to_mfn(iorp->page), 0, p2m_ram_rw);
+    rc = p2m_add_page(d, gfn, page_to_mfn(pg), 0, p2m_ram_rw);
     if ( rc == 0 )
-        paging_mark_pfn_dirty(d, _pfn(gfn_x(iorp->gfn)));
+        paging_mark_pfn_dirty(d, _pfn(gfn_x(gfn)));
 
     return rc;
 }
diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c
index 2e284ad26c..5a09e2ba36 100644
--- a/xen/common/ioreq.c
+++ b/xen/common/ioreq.c
@@ -26,6 +26,8 @@
 #include <xen/paging.h>
 #include <xen/sched.h>
 #include <xen/trace.h>
+#include <xen/vmap.h>
+#include <xen/xvmalloc.h>
 
 #include <asm/guest_atomics.h>
 #include <asm/ioreq.h>
@@ -95,12 +97,10 @@ static struct ioreq_server *get_ioreq_server(const struct domain *d,
 
 static ioreq_t *get_ioreq(struct ioreq_server *s, struct vcpu *v)
 {
-    shared_iopage_t *p = s->ioreq.va;
-
     ASSERT((v == current) || !vcpu_runnable(v));
-    ASSERT(p != NULL);
+    ASSERT(s->ioreq != NULL);
 
-    return &p->vcpu_ioreq[v->vcpu_id];
+    return &s->ioreq[v->vcpu_id];
 }
 
 /*
@@ -260,10 +260,16 @@ bool vcpu_ioreq_handle_completion(struct vcpu *v)
 
 static int ioreq_server_alloc_mfn(struct ioreq_server *s, bool buf)
 {
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-    struct page_info *page;
+    unsigned int i, nr_pages;
+    mfn_t *mfns;
+    int rc;
 
+    /* bufioreq: single page allocation */
+    if ( buf )
     {
+        struct ioreq_page *iorp = &s->bufioreq;
+        struct page_info *page;
+
         if ( iorp->page )
         {
             /*
@@ -294,26 +300,92 @@ static int ioreq_server_alloc_mfn(struct ioreq_server *s, bool buf)
 
         iorp->va = __map_domain_page_global(page);
         if ( !iorp->va )
-            goto fail;
+        {
+            put_page_alloc_ref(page);
+            put_page_and_type(page);
+            return -ENOMEM;
+        }
 
         iorp->page = page;
         clear_page(iorp->va);
         return 0;
     }
 
- fail:
-    put_page_alloc_ref(page);
-    put_page_and_type(page);
+    /* ioreq: multi-page allocation */
+    if ( s->ioreq )
+    {
+        /*
+         * If a guest frame has already been mapped (which may happen
+         * on demand if ioreq_server_get_info() is called), then
+         * allocating a page is not permitted.
+         */
+        if ( !gfn_eq(s->ioreq_gfn, INVALID_GFN) )
+            return -EPERM;
+
+        return 0;
+    }
+
+    nr_pages = nr_ioreq_pages(s->target);
+    mfns = xvmalloc_array(mfn_t, nr_pages);
+
+    if ( !mfns )
+        return -ENOMEM;
+
+    for ( i = 0; i < nr_pages; i++ )
+    {
+        struct page_info *pg = alloc_domheap_page(s->target,
+                                                  MEMF_no_refcount);
+
+        rc = -ENOMEM;
+        if ( !pg )
+            goto fail_pages;
+
+        if ( !get_page_and_type(pg, s->target, PGT_writable_page) )
+        {
+            /*
+             * The domain can't possibly know about this page yet, so
+             * failure here is a clear indication of something fishy
+             * going on.
+             */
+            domain_crash(s->emulator);
+            rc = -ENODATA;
+            goto fail_pages;
+        }
+
+        mfns[i] = page_to_mfn(pg);
+    }
+
+    s->ioreq = vmap(mfns, nr_pages);
+    if ( !s->ioreq )
+    {
+        rc = -ENOMEM;
+        i = nr_pages;
+        goto fail_pages;
+    }
+
+    memset(s->ioreq, 0, nr_pages * PAGE_SIZE);
+    xvfree(mfns);
+    return 0;
 
-    return -ENOMEM;
+ fail_pages:
+    while ( i-- > 0 )
+    {
+        struct page_info *pg = mfn_to_page(mfns[i]);
+
+        put_page_alloc_ref(pg);
+        put_page_and_type(pg);
+    }
+    xvfree(mfns);
+    return rc;
 }
 
 static void ioreq_server_free_mfn(struct ioreq_server *s, bool buf)
 {
-    struct ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-    struct page_info *page = iorp->page;
-
+    if ( buf )
     {
+        struct ioreq_page *iorp = &s->bufioreq;
+        struct page_info *page = iorp->page;
+
         if ( !page )
             return;
 
@@ -324,6 +396,23 @@ static void ioreq_server_free_mfn(struct ioreq_server *s, bool buf)
 
         put_page_alloc_ref(page);
         put_page_and_type(page);
+        return;
+    }
+
+    if ( s->ioreq )
+    {
+        unsigned int i, nr_pages = vmap_size(s->ioreq);
+
+        for ( i = 0; i < nr_pages; i++ )
+        {
+            struct page_info *pg = vmap_to_page((void *)s->ioreq +
+                                                i * PAGE_SIZE);
+
+            put_page_alloc_ref(pg);
+            put_page_and_type(pg);
+        }
+        vunmap(s->ioreq);
+        s->ioreq = NULL;
     }
 }
 
@@ -337,11 +426,29 @@ bool is_ioreq_server_page(struct domain *d, const struct page_info *page)
 
     FOR_EACH_IOREQ_SERVER(d, id, s)
     {
-        if ( (s->ioreq.page == page) || (s->bufioreq.page == page) )
+        if ( s->bufioreq.page == page )
         {
             found = true;
             break;
         }
+
+        if ( s->ioreq )
+        {
+            unsigned int i;
+
+            for ( i = 0; i < nr_ioreq_pages(d); i++ )
+            {
+                if ( vmap_to_page((char *)s->ioreq +
+                                  i * PAGE_SIZE) == page )
+                {
+                    found = true;
+                    break;
+                }
+            }
+
+            if ( found )
+                break;
+        }
     }
 
     rspin_unlock(&d->ioreq_server.lock);
@@ -354,7 +461,7 @@ static void ioreq_server_update_evtchn(struct ioreq_server *s,
 {
     ASSERT(spin_is_locked(&s->lock));
 
-    if ( s->ioreq.va != NULL )
+    if ( s->ioreq != NULL )
     {
         ioreq_t *p = get_ioreq(s, sv->vcpu);
 
@@ -594,7 +701,7 @@ static int ioreq_server_init(struct ioreq_server *s,
     INIT_LIST_HEAD(&s->ioreq_vcpu_list);
     spin_lock_init(&s->bufioreq_lock);
 
-    s->ioreq.gfn = INVALID_GFN;
+    s->ioreq_gfn = INVALID_GFN;
     s->bufioreq.gfn = INVALID_GFN;
 
     rc = ioreq_server_alloc_rangesets(s, id);
@@ -773,7 +880,7 @@ static int ioreq_server_get_info(struct domain *d, ioservid_t id,
     }
 
     if ( ioreq_gfn )
-        *ioreq_gfn = gfn_x(s->ioreq.gfn);
+        *ioreq_gfn = gfn_x(s->ioreq_gfn);
 
     if ( HANDLE_BUFIOREQ(s) )
     {
@@ -816,26 +923,29 @@ int ioreq_server_get_frame(struct domain *d, ioservid_t id,
     if ( rc )
         goto out;
 
-    switch ( idx )
+    if ( idx == XENMEM_resource_ioreq_server_frame_bufioreq )
     {
-    case XENMEM_resource_ioreq_server_frame_bufioreq:
         rc = -ENOENT;
         if ( !HANDLE_BUFIOREQ(s) )
             goto out;
 
         *mfn = page_to_mfn(s->bufioreq.page);
         rc = 0;
-        break;
-
-    case XENMEM_resource_ioreq_server_frame_ioreq(0):
-        *mfn = page_to_mfn(s->ioreq.page);
-        rc = 0;
-        break;
+    }
+    else if ( idx >= XENMEM_resource_ioreq_server_frame_ioreq(0) &&
+              idx < XENMEM_resource_ioreq_server_frame_ioreq(nr_ioreq_pages(d)) )
+    {
+        unsigned int page_idx = idx - XENMEM_resource_ioreq_server_frame_ioreq(0);
 
-    default:
         rc = -EINVAL;
-        break;
+        if ( s->ioreq )
+        {
+            *mfn = vmap_to_mfn((void *)s->ioreq + page_idx * PAGE_SIZE);
+            rc = 0;
+        }
     }
+    else
+        rc = -EINVAL;
 
  out:
     rspin_unlock(&d->ioreq_server.lock);
diff --git a/xen/include/xen/ioreq.h b/xen/include/xen/ioreq.h
index e86f0869fa..41650a59ca 100644
--- a/xen/include/xen/ioreq.h
+++ b/xen/include/xen/ioreq.h
@@ -19,9 +19,16 @@
 #ifndef __XEN_IOREQ_H__
 #define __XEN_IOREQ_H__
 
+#include <xen/macros.h>
 #include <xen/sched.h>
 
 #include <public/hvm/dm_op.h>
+#include <public/hvm/ioreq.h>
+
+static inline unsigned int nr_ioreq_pages(const struct domain *d)
+{
+    return DIV_ROUND_UP(d->max_vcpus, PAGE_SIZE / sizeof(ioreq_t));
+}
 
 struct ioreq_page {
     gfn_t gfn;
@@ -45,7 +52,8 @@ struct ioreq_server {
     /* Lock to serialize toolstack modifications */
     spinlock_t             lock;
 
-    struct ioreq_page      ioreq;
+    ioreq_t                *ioreq;
+    gfn_t                  ioreq_gfn;
     struct list_head       ioreq_vcpu_list;
     struct ioreq_page      bufioreq;
 
-- 
2.51.0



--
Julian Vetter | Vates Hypervisor & Kernel Developer

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.