
[Xen-devel] [PATCHv6 2/5] gnttab: introduce per-active entry locks



From: Matt Wilson <msw@xxxxxxxxxx>

Instead of protecting the state of a grant table active entry with the
grant table lock, use per-active-entry locks.

Active entries must be acquired with active_entry_acquire() and
released with active_entry_release(), which lock and unlock the
entry's spinlock.
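
For illustration, callers are expected to nest the entry lock inside
the grant table lock, roughly like this (a sketch only, not part of
the patch; rgt and ref stand for an already-validated grant table and
grant reference):

    struct active_grant_entry *act;

    spin_lock(&rgt->lock);                /* still serialises the table */
    act = active_entry_acquire(rgt, ref); /* locks just this entry */

    /* ... read or update act->pin, act->frame, act->domid ... */

    active_entry_release(act);
    spin_unlock(&rgt->lock);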

This is the first step towards improving grant table scalability and
will allow the heavily contended grant table lock to be used less.

Signed-off-by: Matt Wilson <msw@xxxxxxxxxx>
Signed-off-by: David Vrabel <david.vrabel@xxxxxxxxxx>
---
 xen/common/grant_table.c |  213 ++++++++++++++++++++++++++++------------------
 1 file changed, 129 insertions(+), 84 deletions(-)

diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index dfb45f8..3a555f9 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -157,10 +157,13 @@ struct active_grant_entry {
                                in the page.                           */
     unsigned      length:16; /* For sub-page grants, the length of the
                                 grant.                                */
+    spinlock_t    lock;      /* lock to protect access to this entry;
+                                see docs/misc/grant-tables.txt for
+                                the locking protocol.                */
 };
 
 #define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry))
-#define active_entry(t, e) \
+#define _active_entry(t, e) \
     ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
 
 static inline void gnttab_flush_tlb(const struct domain *d)
@@ -188,6 +191,24 @@ nr_active_grant_frames(struct grant_table *gt)
     return num_act_frames_from_sha_frames(nr_grant_frames(gt));
 }
 
+static inline struct active_grant_entry *
+active_entry_acquire(struct grant_table *t, grant_ref_t e)
+{
+    struct active_grant_entry *act;
+
+    ASSERT(spin_is_locked(&t->lock));
+
+    act = &_active_entry(t, e);
+    spin_lock(&act->lock);
+
+    return act;
+}
+
+static inline void active_entry_release(struct active_grant_entry *act)
+{
+    spin_unlock(&act->lock);
+}
+
 /* Check if the page has been paged out, or needs unsharing. 
    If rc == GNTST_okay, *page contains the page struct with a ref taken.
    Caller must do put_page(*page).
@@ -228,30 +249,6 @@ static int __get_paged_frame(unsigned long gfn, unsigned long *frame, struct pag
     return rc;
 }
 
-static inline void
-double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
-{
-    if ( lgt < rgt )
-    {
-        spin_lock(&lgt->lock);
-        spin_lock(&rgt->lock);
-    }
-    else
-    {
-        if ( lgt != rgt )
-            spin_lock(&rgt->lock);
-        spin_lock(&lgt->lock);
-    }
-}
-
-static inline void
-double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
-{
-    spin_unlock(&lgt->lock);
-    if ( lgt != rgt )
-        spin_unlock(&rgt->lock);
-}
-
 static inline int
 __get_maptrack_handle(
     struct grant_table *t)
@@ -505,26 +502,23 @@ static int grant_map_exists(const struct domain *ld,
                             unsigned long mfn,
                             unsigned int *ref_count)
 {
-    const struct active_grant_entry *act;
+    struct active_grant_entry *act;
     unsigned int ref, max_iter;
     
     ASSERT(spin_is_locked(&rgt->lock));
 
     max_iter = min(*ref_count + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT),
                    nr_grant_entries(rgt));
-    for ( ref = *ref_count; ref < max_iter; ref++ )
+    for ( ref = *ref_count; ref < max_iter; active_entry_release(act), ref++ )
     {
-        act = &active_entry(rgt, ref);
+        act = active_entry_acquire(rgt, ref);
 
-        if ( !act->pin )
-            continue;
-
-        if ( act->domid != ld->domain_id )
-            continue;
-
-        if ( act->frame != mfn )
+        if ( !act->pin ||
+             act->domid != ld->domain_id ||
+             act->frame != mfn )
             continue;
 
+        active_entry_release(act);
         return 0;
     }
 
@@ -548,11 +542,14 @@ static void mapcount(
 
     for ( handle = 0; handle < lgt->maptrack_limit; handle++ )
     {
+        struct active_grant_entry *act;
+
         map = &maptrack_entry(lgt, handle);
         if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ||
              map->domid != rd->domain_id )
             continue;
-        if ( active_entry(rd->grant_table, map->ref).frame == mfn )
+        act = &_active_entry(rd->grant_table, map->ref);
+        if ( act->frame == mfn )
             (map->flags & GNTMAP_readonly) ? (*rdc)++ : (*wrc)++;
     }
 }
@@ -576,7 +573,6 @@ __gnttab_map_grant_ref(
     struct page_info *pg = NULL;
     int            rc = GNTST_okay;
     u32            old_pin;
-    u32            act_pin;
     unsigned int   cache_flags;
     struct active_grant_entry *act = NULL;
     struct grant_mapping *mt;
@@ -639,7 +635,7 @@ __gnttab_map_grant_ref(
     if ( unlikely(op->ref >= nr_grant_entries(rgt)))
         PIN_FAIL(unlock_out, GNTST_bad_gntref, "Bad ref (%d).\n", op->ref);
 
-    act = &active_entry(rgt, op->ref);
+    act = active_entry_acquire(rgt, op->ref);
     shah = shared_entry_header(rgt, op->ref);
     if (rgt->gt_version == 1) {
         sha1 = &shared_entry_v1(rgt, op->ref);
@@ -656,7 +652,7 @@ __gnttab_map_grant_ref(
          ((act->domid != ld->domain_id) ||
           (act->pin & 0x80808080U) != 0 ||
           (act->is_sub_page)) )
-        PIN_FAIL(unlock_out, GNTST_general_error,
+        PIN_FAIL(act_release_out, GNTST_general_error,
                  "Bad domain (%d != %d), or risk of counter overflow %08x, or 
subpage %d\n",
                  act->domid, ld->domain_id, act->pin, act->is_sub_page);
 
@@ -667,7 +663,7 @@ __gnttab_map_grant_ref(
         if ( (rc = _set_status(rgt->gt_version, ld->domain_id,
                                op->flags & GNTMAP_readonly,
                                1, shah, act, status) ) != GNTST_okay )
-             goto unlock_out;
+            goto act_release_out;
 
         if ( !act->pin )
         {
@@ -698,12 +694,9 @@ __gnttab_map_grant_ref(
             GNTPIN_hstr_inc : GNTPIN_hstw_inc;
 
     frame = act->frame;
-    act_pin = act->pin;
 
     cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
 
-    spin_unlock(&rgt->lock);
-
     /* pg may be set, with a refcount included, from __get_paged_frame */
     if ( !pg )
     {
@@ -778,8 +771,6 @@ __gnttab_map_grant_ref(
         goto undo_out;
     }
 
-    double_gt_lock(lgt, rgt);
-
     if ( gnttab_need_iommu_mapping(ld) )
     {
         unsigned int wrc, rdc;
@@ -787,21 +778,20 @@ __gnttab_map_grant_ref(
         /* We're not translated, so we know that gmfns and mfns are
            the same things, so the IOMMU entry is always 1-to-1. */
         mapcount(lgt, rd, frame, &wrc, &rdc);
-        if ( (act_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) &&
+        if ( (act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) &&
              !(old_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) )
         {
             if ( wrc == 0 )
                 err = iommu_map_page(ld, frame, frame,
                                      IOMMUF_readable|IOMMUF_writable);
         }
-        else if ( act_pin && !old_pin )
+        else if ( act->pin && !old_pin )
         {
             if ( (wrc + rdc) == 0 )
                 err = iommu_map_page(ld, frame, frame, IOMMUF_readable);
         }
         if ( err )
         {
-            double_gt_unlock(lgt, rgt);
             rc = GNTST_general_error;
             goto undo_out;
         }
@@ -814,7 +804,8 @@ __gnttab_map_grant_ref(
     mt->ref   = op->ref;
     mt->flags = op->flags;
 
-    double_gt_unlock(lgt, rgt);
+    active_entry_release(act);
+    spin_unlock(&rgt->lock);
 
     op->dev_bus_addr = (u64)frame << PAGE_SHIFT;
     op->handle       = handle;
@@ -837,10 +828,6 @@ __gnttab_map_grant_ref(
         put_page(pg);
     }
 
-    spin_lock(&rgt->lock);
-
-    act = &active_entry(rgt, op->ref);
-
     if ( op->flags & GNTMAP_device_map )
         act->pin -= (op->flags & GNTMAP_readonly) ?
             GNTPIN_devr_inc : GNTPIN_devw_inc;
@@ -856,6 +843,9 @@ __gnttab_map_grant_ref(
     if ( !act->pin )
         gnttab_clear_flag(_GTF_reading, status);
 
+ act_release_out:
+    active_entry_release(act);
+
  unlock_out:
     spin_unlock(&rgt->lock);
     op->status = rc;
@@ -939,18 +929,19 @@ __gnttab_unmap_common(
     TRACE_1D(TRC_MEM_PAGE_GRANT_UNMAP, dom);
 
     rgt = rd->grant_table;
-    double_gt_lock(lgt, rgt);
+
+    spin_lock(&rgt->lock);
+    act = active_entry_acquire(rgt, op->map->ref);
 
     op->flags = op->map->flags;
     if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) )
     {
         gdprintk(XENLOG_WARNING, "Unstable handle %u\n", op->handle);
         rc = GNTST_bad_handle;
-        goto unmap_out;
+        goto act_release_out;
     }
 
     op->rd = rd;
-    act = &active_entry(rgt, op->map->ref);
 
     if ( op->frame == 0 )
     {
@@ -959,7 +950,7 @@ __gnttab_unmap_common(
     else
     {
         if ( unlikely(op->frame != act->frame) )
-            PIN_FAIL(unmap_out, GNTST_general_error,
+            PIN_FAIL(act_release_out, GNTST_general_error,
                      "Bad frame number doesn't match gntref. (%lx != %lx)\n",
                      op->frame, act->frame);
         if ( op->flags & GNTMAP_device_map )
@@ -978,7 +969,7 @@ __gnttab_unmap_common(
         if ( (rc = replace_grant_host_mapping(op->host_addr,
                                               op->frame, op->new_addr, 
                                               op->flags)) < 0 )
-            goto unmap_out;
+            goto act_release_out;
 
         ASSERT(act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask));
         op->map->flags &= ~GNTMAP_host_map;
@@ -1000,7 +991,7 @@ __gnttab_unmap_common(
         if ( err )
         {
             rc = GNTST_general_error;
-            goto unmap_out;
+            goto act_release_out;
         }
     }
 
@@ -1008,8 +999,10 @@ __gnttab_unmap_common(
     if ( !(op->flags & GNTMAP_readonly) )
          gnttab_mark_dirty(rd, op->frame);
 
- unmap_out:
-    double_gt_unlock(lgt, rgt);
+ act_release_out:
+    active_entry_release(act);
+    spin_unlock(&rgt->lock);
+
     op->status = rc;
     rcu_unlock_domain(rd);
 }
@@ -1039,12 +1032,12 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
 
     rcu_lock_domain(rd);
     rgt = rd->grant_table;
-    spin_lock(&rgt->lock);
 
+    spin_lock(&rgt->lock);
     if ( rgt->gt_version == 0 )
-        goto unmap_out;
+        goto unlock_out;
 
-    act = &active_entry(rgt, op->map->ref);
+    act = active_entry_acquire(rgt, op->map->ref);
     sha = shared_entry_header(rgt, op->map->ref);
 
     if ( rgt->gt_version == 1 )
@@ -1058,7 +1051,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
          * Suggests that __gntab_unmap_common failed early and so
          * nothing further to do
          */
-        goto unmap_out;
+        goto act_release_out;
     }
 
     pg = mfn_to_page(op->frame);
@@ -1082,7 +1075,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
              * Suggests that __gntab_unmap_common failed in
              * replace_grant_host_mapping() so nothing further to do
              */
-            goto unmap_out;
+            goto act_release_out;
         }
 
         if ( !is_iomem_page(op->frame) ) 
@@ -1103,8 +1096,12 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
     if ( act->pin == 0 )
         gnttab_clear_flag(_GTF_reading, status);
 
- unmap_out:
+ act_release_out:
+    active_entry_release(act);
+
+ unlock_out:
     spin_unlock(&rgt->lock);
+
     if ( put_handle )
     {
         op->map->flags = 0;
@@ -1290,13 +1287,17 @@ gnttab_unpopulate_status_frames(struct domain *d, struct grant_table *gt)
     gt->nr_status_frames = 0;
 }
 
+/*
+ * Grow the grant table. The caller must hold the grant table's
+ * write lock before calling this function.
+ */
 int
 gnttab_grow_table(struct domain *d, unsigned int req_nr_frames)
 {
-    /* d's grant table lock must be held by the caller */
+    /* d's grant table write lock must be held by the caller */
 
     struct grant_table *gt = d->grant_table;
-    unsigned int i;
+    unsigned int i, j;
 
     ASSERT(req_nr_frames <= max_grant_frames);
 
@@ -1311,6 +1312,8 @@ gnttab_grow_table(struct domain *d, unsigned int req_nr_frames)
         if ( (gt->active[i] = alloc_xenheap_page()) == NULL )
             goto active_alloc_failed;
         clear_page(gt->active[i]);
+        for ( j = 0; j < ACGNT_PER_PAGE; j++ )
+            spin_lock_init(&gt->active[i][j].lock);
     }
 
     /* Shared */
@@ -1806,7 +1809,7 @@ __release_grant_for_copy(
 
     spin_lock(&rgt->lock);
 
-    act = &active_entry(rgt, gref);
+    act = active_entry_acquire(rgt, gref);
     sha = shared_entry_header(rgt, gref);
     r_frame = act->frame;
 
@@ -1845,6 +1848,7 @@ __release_grant_for_copy(
         released_read = 1;
     }
 
+    active_entry_release(act);
     spin_unlock(&rgt->lock);
 
     if ( td != rd )
@@ -1906,14 +1910,14 @@ __acquire_grant_for_copy(
     spin_lock(&rgt->lock);
 
     if ( rgt->gt_version == 0 )
-        PIN_FAIL(unlock_out, GNTST_general_error,
+        PIN_FAIL(gnt_unlock_out, GNTST_general_error,
                  "remote grant table not ready\n");
 
     if ( unlikely(gref >= nr_grant_entries(rgt)) )
-        PIN_FAIL(unlock_out, GNTST_bad_gntref,
+        PIN_FAIL(gnt_unlock_out, GNTST_bad_gntref,
                  "Bad grant reference %ld\n", gref);
 
-    act = &active_entry(rgt, gref);
+    act = active_entry_acquire(rgt, gref);
     shah = shared_entry_header(rgt, gref);
     if ( rgt->gt_version == 1 )
     {
@@ -1972,6 +1976,13 @@ __acquire_grant_for_copy(
                 PIN_FAIL(unlock_out_clear, GNTST_general_error,
                          "transitive grant referenced bad domain %d\n",
                          trans_domid);
+
+            /*
+             * The recursive __acquire_grant_for_copy() call below
+             * could take this same table's lock (if rd == td), so
+             * we have to drop the entry and table locks here and
+             * reacquire them afterwards.
+             */
+            active_entry_release(act);
             spin_unlock(&rgt->lock);
 
             rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
@@ -1979,10 +1990,13 @@ __acquire_grant_for_copy(
                                           &trans_page_off, &trans_length, 0);
 
             spin_lock(&rgt->lock);
+            act = active_entry_acquire(rgt, gref);
+
             if ( rc != GNTST_okay ) {
                 __fixup_status_for_copy_pin(act, status);
-                rcu_unlock_domain(td);
+                active_entry_release(act);
                 spin_unlock(&rgt->lock);
+                rcu_unlock_domain(td);
                 return rc;
             }
 
@@ -1994,6 +2008,7 @@ __acquire_grant_for_copy(
             {
                 __fixup_status_for_copy_pin(act, status);
                 rcu_unlock_domain(td);
+                active_entry_release(act);
                 spin_unlock(&rgt->lock);
                 put_page(*page);
                 return __acquire_grant_for_copy(rd, gref, ldom, readonly,
@@ -2062,6 +2077,7 @@ __acquire_grant_for_copy(
     *length = act->length;
     *frame = act->frame;
 
+    active_entry_release(act);
     spin_unlock(&rgt->lock);
     return rc;
  
@@ -2074,7 +2090,11 @@ __acquire_grant_for_copy(
         gnttab_clear_flag(_GTF_reading, status);
 
  unlock_out:
+    active_entry_release(act);
+
+ gnt_unlock_out:
     spin_unlock(&rgt->lock);
+
     return rc;
 }
 
@@ -2399,16 +2419,18 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
     {
         for ( i = GNTTAB_NR_RESERVED_ENTRIES; i < nr_grant_entries(gt); i++ )
         {
-            act = &active_entry(gt, i);
+            act = active_entry_acquire(gt, i);
             if ( act->pin != 0 )
             {
                 gdprintk(XENLOG_WARNING,
                          "tried to change grant table version from %d to %d, 
but some grant entries still in use\n",
                          gt->gt_version,
                          op.version);
+                active_entry_release(act);
                 res = -EBUSY;
                 goto out_unlock;
             }
+            active_entry_release(act);
         }
     }
 
@@ -2587,9 +2609,17 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
 {
     struct domain *d = rcu_lock_current_domain();
     struct grant_table *gt = d->grant_table;
-    struct active_grant_entry *act;
+    struct active_grant_entry *act_a = NULL;
+    struct active_grant_entry *act_b = NULL;
     s16 rc = GNTST_okay;
 
+    if ( ref_a == ref_b )
+        /*
+         * This is a no-op, so avoid acquiring the same active entry
+         * twice, which would cause a deadlock.
+         */
+        return rc;
+
     spin_lock(&gt->lock);
 
     /* Bounds check on the grant refs */
@@ -2598,12 +2628,12 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
     if ( unlikely(ref_b >= nr_grant_entries(d->grant_table)))
         PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-b (%d).\n", ref_b);
 
-    act = &active_entry(gt, ref_a);
-    if ( act->pin )
+    act_a = active_entry_acquire(gt, ref_a);
+    if ( act_a->pin )
         PIN_FAIL(out, GNTST_eagain, "ref a %ld busy\n", (long)ref_a);
 
-    act = &active_entry(gt, ref_b);
-    if ( act->pin )
+    act_b = active_entry_acquire(gt, ref_b);
+    if ( act_b->pin )
         PIN_FAIL(out, GNTST_eagain, "ref b %ld busy\n", (long)ref_b);
 
     if ( gt->gt_version == 1 )
@@ -2630,6 +2660,10 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
     }
 
 out:
+    if ( act_b != NULL )
+        active_entry_release(act_b);
+    if ( act_a != NULL )
+        active_entry_release(act_a);
     spin_unlock(&gt->lock);
 
     rcu_unlock_domain(d);
@@ -2939,7 +2973,7 @@ grant_table_create(
     struct domain *d)
 {
     struct grant_table *t;
-    int                 i;
+    int                 i, j;
 
     if ( (t = xzalloc(struct grant_table)) == NULL )
         goto no_mem_0;
@@ -2958,6 +2992,8 @@ grant_table_create(
         if ( (t->active[i] = alloc_xenheap_page()) == NULL )
             goto no_mem_2;
         clear_page(t->active[i]);
+        for ( j = 0; j < ACGNT_PER_PAGE; j++ )
+            spin_lock_init(&t->active[i][j].lock);
     }
 
     /* Tracking of mapped foreign frames table */
@@ -3054,7 +3090,7 @@ gnttab_release_mappings(
         rgt = rd->grant_table;
         spin_lock(&rgt->lock);
 
-        act = &active_entry(rgt, ref);
+        act = active_entry_acquire(rgt, ref);
         sha = shared_entry_header(rgt, ref);
         if (rgt->gt_version == 1)
             status = &sha->flags;
@@ -3112,6 +3148,7 @@ gnttab_release_mappings(
         if ( act->pin == 0 )
             gnttab_clear_flag(_GTF_reading, status);
 
+        active_entry_release(act);
         spin_unlock(&rgt->lock);
 
         rcu_unlock_domain(rd);
@@ -3130,7 +3167,9 @@ grant_table_destroy(
 
     if ( t == NULL )
         return;
-    
+
+    spin_lock(&t->lock);
+
     for ( i = 0; i < nr_grant_frames(t); i++ )
         free_xenheap_page(t->shared_raw[i]);
     xfree(t->shared_raw);
@@ -3147,6 +3186,8 @@ grant_table_destroy(
         free_xenheap_page(t->status[i]);
     xfree(t->status);
 
+    spin_unlock(&t->lock);
+
     xfree(t);
     d->grant_table = NULL;
 }
@@ -3174,9 +3215,12 @@ static void gnttab_usage_print(struct domain *rd)
         uint16_t status;
         uint64_t frame;
 
-        act = &active_entry(gt, ref);
+        act = active_entry_acquire(gt, ref);
         if ( !act->pin )
+        {
+            active_entry_release(act);
             continue;
+        }
 
         sha = shared_entry_header(gt, ref);
 
@@ -3206,6 +3250,7 @@ static void gnttab_usage_print(struct domain *rd)
         printk("[%3d]    %5d 0x%06lx 0x%08x      %5d 0x%06"PRIx64" 0x%02x\n",
                ref, act->domid, act->frame, act->pin,
                sha->domid, frame, status);
+        active_entry_release(act);
     }
 
  out:
-- 
1.7.10.4

