
[Xen-devel] [PATCHv4 2/3] grant_table: convert grant table rwlock to percpu rwlock



The per-domain grant table read lock suffers from significant contention
when performing multi-queue block or network IO, due to the parallel
grant map/unmap/copy operations occurring on the DomU's grant table.

On multi-socket systems, the contention results in the locked
compare-and-swap operation failing frequently, which leads to a tight
loop of retries of the compare-and-swap operation. As the coherency
fabric can only sustain a limited rate of compare-and-swap operations
on a particular data location, taking the read lock itself becomes a
bottleneck for grant operations.

With a standard rwlock, a single-VIF VM-to-VM transfer with 16 queues
configured was limited to approximately 15 Gbit/s on a 2-socket
Haswell-EP host.

With the percpu rwlock, the same configuration achieves approximately
48 Gbit/s.
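
The percpu rwlock introduced earlier in this series avoids the shared
cache line on the read path: each reader publishes its intent in a
per-CPU variable and only falls back to the underlying rwlock while a
writer is active. The sketch below is a simplified illustration of that
idea under those assumptions; the toy_* names are not the real API
(the real interface is percpu_read_lock()/percpu_write_lock() and the
grant_percpu_* wrappers added by this patch):

/*
 * Simplified sketch of a percpu rwlock read path, for illustration
 * only.  The real implementation is provided by the percpu rwlock
 * patch earlier in this series.
 */
struct toy_percpu_rwlock {
    rwlock_t rwlock;             /* slow path, used only around writers */
    bool     writer_activating;  /* set while a writer wants the lock   */
};

DEFINE_PER_CPU(struct toy_percpu_rwlock *, toy_reader_owns);

static inline void toy_percpu_read_lock(struct toy_percpu_rwlock *l)
{
    /* Publish ourselves with a purely CPU-local write: no shared line. */
    this_cpu(toy_reader_owns) = l;
    smp_mb();
    if ( unlikely(l->writer_activating) )
    {
        /* A writer is active: back out and use the conventional path. */
        this_cpu(toy_reader_owns) = NULL;
        read_lock(&l->rwlock);
    }
}

static inline void toy_percpu_read_unlock(struct toy_percpu_rwlock *l)
{
    if ( this_cpu(toy_reader_owns) == l )
        this_cpu(toy_reader_owns) = NULL;   /* fast path: local write only */
    else
        read_unlock(&l->rwlock);            /* we took the slow path */
}

In this model the writer side would set writer_activating, take the
write lock and wait for the per-CPU reader slots to drain, so the
common read path never performs an atomic operation on shared data.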

OProfile was used to measure the initial overhead of the read-write
locks and to confirm that the overhead was dramatically reduced by the
percpu rwlocks.

Signed-off-by: Malcolm Crossley <malcolm.crossley@xxxxxxxxxx>
---
Changes since v2:
 - Switched to using wrappers for taking percpu rwlock
 - Added percpu structure pointer to percpu rwlock initialisation
 - Added comment on removal of ASSERTS for grant table rw_is_locked()
Changes since v1:
 - Used new macros provided in updated percpu rwlock v2 patch
 - Converted grant table rwlock_t to percpu_rwlock_t
 - Patched a missed grant table rwlock_t usage site
---
 xen/arch/arm/mm.c             |   4 +-
 xen/arch/x86/mm.c             |   4 +-
 xen/common/grant_table.c      | 126 +++++++++++++++++++++++-------------------
 xen/include/xen/grant_table.h |  24 +++++++-
 4 files changed, 97 insertions(+), 61 deletions(-)

diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index 47bfb27..928d58ed 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -1055,7 +1055,7 @@ int xenmem_add_to_physmap_one(
     switch ( space )
     {
     case XENMAPSPACE_grant_table:
-        write_lock(&d->grant_table->lock);
+        grant_percpu_write_lock(&d->grant_table->lock);
 
         if ( d->grant_table->gt_version == 0 )
             d->grant_table->gt_version = 1;
@@ -1085,7 +1085,7 @@ int xenmem_add_to_physmap_one(
 
         t = p2m_ram_rw;
 
-        write_unlock(&d->grant_table->lock);
+        grant_percpu_write_unlock(&d->grant_table->lock);
         break;
     case XENMAPSPACE_shared_info:
         if ( idx != 0 )
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 92df36f..8c968fa 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4654,7 +4654,7 @@ int xenmem_add_to_physmap_one(
                 mfn = virt_to_mfn(d->shared_info);
             break;
         case XENMAPSPACE_grant_table:
-            write_lock(&d->grant_table->lock);
+            grant_percpu_write_lock(&d->grant_table->lock);
 
             if ( d->grant_table->gt_version == 0 )
                 d->grant_table->gt_version = 1;
@@ -4676,7 +4676,7 @@ int xenmem_add_to_physmap_one(
                     mfn = virt_to_mfn(d->grant_table->shared_raw[idx]);
             }
 
-            write_unlock(&d->grant_table->lock);
+            grant_percpu_write_unlock(&d->grant_table->lock);
             break;
         case XENMAPSPACE_gmfn_range:
         case XENMAPSPACE_gmfn:
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index 5d52d1e..fb6b808 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -178,6 +178,8 @@ struct active_grant_entry {
 #define _active_entry(t, e) \
     ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
 
+DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);
+
 static inline void gnttab_flush_tlb(const struct domain *d)
 {
     if ( !paging_mode_external(d) )
@@ -208,7 +210,13 @@ active_entry_acquire(struct grant_table *t, grant_ref_t e)
 {
     struct active_grant_entry *act;
 
-    ASSERT(rw_is_locked(&t->lock));
+    /*
+     * The grant table for the active entry should be locked, but the
+     * percpu rwlock cannot be checked for a read lock without races or
+     * high overhead, so we cannot use an ASSERT:
+     *
+     *   ASSERT(rw_is_locked(&t->lock));
+     */
 
     act = &_active_entry(t, e);
     spin_lock(&act->lock);
@@ -270,23 +278,23 @@ double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
      */
     if ( lgt < rgt )
     {
-        write_lock(&lgt->lock);
-        write_lock(&rgt->lock);
+        grant_percpu_write_lock(&lgt->lock);
+        grant_percpu_write_lock(&rgt->lock);
     }
     else
     {
         if ( lgt != rgt )
-            write_lock(&rgt->lock);
-        write_lock(&lgt->lock);
+            grant_percpu_write_lock(&rgt->lock);
+        grant_percpu_write_lock(&lgt->lock);
     }
 }
 
 static inline void
 double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
 {
-    write_unlock(&lgt->lock);
+    grant_percpu_write_unlock(&lgt->lock);
     if ( lgt != rgt )
-        write_unlock(&rgt->lock);
+        grant_percpu_write_unlock(&rgt->lock);
 }
 
 static inline int
@@ -659,8 +667,14 @@ static int grant_map_exists(const struct domain *ld,
                             unsigned int *ref_count)
 {
     unsigned int ref, max_iter;
-    
-    ASSERT(rw_is_locked(&rgt->lock));
+
+    /*
+     * The remote grant table should be locked, but the percpu rwlock
+     * cannot be checked for a read lock without races or high
+     * overhead, so we cannot use an ASSERT:
+     *
+     *   ASSERT(rw_is_locked(&rgt->lock));
+     */
 
     max_iter = min(*ref_count + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT),
                    nr_grant_entries(rgt));
@@ -703,12 +717,12 @@ static unsigned int mapkind(
      * Must have the local domain's grant table write lock when
      * iterating over its maptrack entries.
      */
-    ASSERT(rw_is_write_locked(&lgt->lock));
+    ASSERT(percpu_rw_is_write_locked(&lgt->lock));
     /*
      * Must have the remote domain's grant table write lock while
      * counting its active entries.
      */
-    ASSERT(rw_is_write_locked(&rd->grant_table->lock));
+    ASSERT(percpu_rw_is_write_locked(&rd->grant_table->lock));
 
     for ( handle = 0; !(kind & MAPKIND_WRITE) &&
                       handle < lgt->maptrack_limit; handle++ )
@@ -796,7 +810,7 @@ __gnttab_map_grant_ref(
     }
 
     rgt = rd->grant_table;
-    read_lock(&rgt->lock);
+    grant_percpu_read_lock(&rgt->lock);
 
     /* Bounds check on the grant ref */
     if ( unlikely(op->ref >= nr_grant_entries(rgt)))
@@ -859,7 +873,7 @@ __gnttab_map_grant_ref(
     cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
 
     active_entry_release(act);
-    read_unlock(&rgt->lock);
+    grant_percpu_read_unlock(&rgt->lock);
 
     /* pg may be set, with a refcount included, from __get_paged_frame */
     if ( !pg )
@@ -1006,7 +1020,7 @@ __gnttab_map_grant_ref(
         put_page(pg);
     }
 
-    read_lock(&rgt->lock);
+    grant_percpu_read_lock(&rgt->lock);
 
     act = active_entry_acquire(rgt, op->ref);
 
@@ -1029,7 +1043,7 @@ __gnttab_map_grant_ref(
     active_entry_release(act);
 
  unlock_out:
-    read_unlock(&rgt->lock);
+    grant_percpu_read_unlock(&rgt->lock);
     op->status = rc;
     put_maptrack_handle(lgt, handle);
     rcu_unlock_domain(rd);
@@ -1080,18 +1094,18 @@ __gnttab_unmap_common(
 
     op->map = &maptrack_entry(lgt, op->handle);
 
-    read_lock(&lgt->lock);
+    grant_percpu_read_lock(&lgt->lock);
 
     if ( unlikely(!read_atomic(&op->map->flags)) )
     {
-        read_unlock(&lgt->lock);
+        grant_percpu_read_unlock(&lgt->lock);
         gdprintk(XENLOG_INFO, "Zero flags for handle (%d).\n", op->handle);
         op->status = GNTST_bad_handle;
         return;
     }
 
     dom = op->map->domid;
-    read_unlock(&lgt->lock);
+    grant_percpu_read_unlock(&lgt->lock);
 
     if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) )
     {
@@ -1113,7 +1127,7 @@ __gnttab_unmap_common(
 
     rgt = rd->grant_table;
 
-    read_lock(&rgt->lock);
+    grant_percpu_read_lock(&rgt->lock);
 
     op->flags = read_atomic(&op->map->flags);
     if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) )
@@ -1165,7 +1179,7 @@ __gnttab_unmap_common(
  act_release_out:
     active_entry_release(act);
  unmap_out:
-    read_unlock(&rgt->lock);
+    grant_percpu_read_unlock(&rgt->lock);
 
     if ( rc == GNTST_okay && gnttab_need_iommu_mapping(ld) )
     {
@@ -1220,7 +1234,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
     rcu_lock_domain(rd);
     rgt = rd->grant_table;
 
-    read_lock(&rgt->lock);
+    grant_percpu_read_lock(&rgt->lock);
     if ( rgt->gt_version == 0 )
         goto unlock_out;
 
@@ -1286,7 +1300,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
  act_release_out:
     active_entry_release(act);
  unlock_out:
-    read_unlock(&rgt->lock);
+    grant_percpu_read_unlock(&rgt->lock);
 
     if ( put_handle )
     {
@@ -1585,7 +1599,7 @@ gnttab_setup_table(
     }
 
     gt = d->grant_table;
-    write_lock(&gt->lock);
+    grant_percpu_write_lock(&gt->lock);
 
     if ( gt->gt_version == 0 )
         gt->gt_version = 1;
@@ -1613,7 +1627,7 @@ gnttab_setup_table(
     }
 
  out3:
-    write_unlock(&gt->lock);
+    grant_percpu_write_unlock(&gt->lock);
  out2:
     rcu_unlock_domain(d);
  out1:
@@ -1655,13 +1669,13 @@ gnttab_query_size(
         goto query_out_unlock;
     }
 
-    read_lock(&d->grant_table->lock);
+    grant_percpu_read_lock(&d->grant_table->lock);
 
     op.nr_frames     = nr_grant_frames(d->grant_table);
     op.max_nr_frames = max_grant_frames;
     op.status        = GNTST_okay;
 
-    read_unlock(&d->grant_table->lock);
+    grant_percpu_read_unlock(&d->grant_table->lock);
 
  
  query_out_unlock:
@@ -1687,7 +1701,7 @@ gnttab_prepare_for_transfer(
     union grant_combo   scombo, prev_scombo, new_scombo;
     int                 retries = 0;
 
-    read_lock(&rgt->lock);
+    grant_percpu_read_lock(&rgt->lock);
 
     if ( unlikely(ref >= nr_grant_entries(rgt)) )
     {
@@ -1730,11 +1744,11 @@ gnttab_prepare_for_transfer(
         scombo = prev_scombo;
     }
 
-    read_unlock(&rgt->lock);
+    grant_percpu_read_unlock(&rgt->lock);
     return 1;
 
  fail:
-    read_unlock(&rgt->lock);
+    grant_percpu_read_unlock(&rgt->lock);
     return 0;
 }
 
@@ -1925,7 +1939,7 @@ gnttab_transfer(
         TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id);
 
         /* Tell the guest about its new page frame. */
-        read_lock(&e->grant_table->lock);
+        grant_percpu_read_lock(&e->grant_table->lock);
         act = active_entry_acquire(e->grant_table, gop.ref);
 
         if ( e->grant_table->gt_version == 1 )
@@ -1949,7 +1963,7 @@ gnttab_transfer(
             GTF_transfer_completed;
 
         active_entry_release(act);
-        read_unlock(&e->grant_table->lock);
+        grant_percpu_read_unlock(&e->grant_table->lock);
 
         rcu_unlock_domain(e);
 
@@ -1987,7 +2001,7 @@ __release_grant_for_copy(
     released_read = 0;
     released_write = 0;
 
-    read_lock(&rgt->lock);
+    grant_percpu_read_lock(&rgt->lock);
 
     act = active_entry_acquire(rgt, gref);
     sha = shared_entry_header(rgt, gref);
@@ -2029,7 +2043,7 @@ __release_grant_for_copy(
     }
 
     active_entry_release(act);
-    read_unlock(&rgt->lock);
+    grant_percpu_read_unlock(&rgt->lock);
 
     if ( td != rd )
     {
@@ -2086,7 +2100,7 @@ __acquire_grant_for_copy(
 
     *page = NULL;
 
-    read_lock(&rgt->lock);
+    grant_percpu_read_lock(&rgt->lock);
 
     if ( unlikely(gref >= nr_grant_entries(rgt)) )
         PIN_FAIL(gt_unlock_out, GNTST_bad_gntref,
@@ -2168,20 +2182,20 @@ __acquire_grant_for_copy(
              * here and reacquire
              */
             active_entry_release(act);
-            read_unlock(&rgt->lock);
+            grant_percpu_read_unlock(&rgt->lock);
 
             rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
                                           readonly, &grant_frame, page,
                                           &trans_page_off, &trans_length, 0);
 
-            read_lock(&rgt->lock);
+            grant_percpu_read_lock(&rgt->lock);
             act = active_entry_acquire(rgt, gref);
 
             if ( rc != GNTST_okay ) {
                 __fixup_status_for_copy_pin(act, status);
                 rcu_unlock_domain(td);
                 active_entry_release(act);
-                read_unlock(&rgt->lock);
+                grant_percpu_read_unlock(&rgt->lock);
                 return rc;
             }
 
@@ -2194,7 +2208,7 @@ __acquire_grant_for_copy(
                 __fixup_status_for_copy_pin(act, status);
                 rcu_unlock_domain(td);
                 active_entry_release(act);
-                read_unlock(&rgt->lock);
+                grant_percpu_read_unlock(&rgt->lock);
                 put_page(*page);
                 return __acquire_grant_for_copy(rd, gref, ldom, readonly,
                                                 frame, page, page_off, length,
@@ -2258,7 +2272,7 @@ __acquire_grant_for_copy(
     *frame = act->frame;
 
     active_entry_release(act);
-    read_unlock(&rgt->lock);
+    grant_percpu_read_unlock(&rgt->lock);
     return rc;
  
  unlock_out_clear:
@@ -2273,7 +2287,7 @@ __acquire_grant_for_copy(
     active_entry_release(act);
 
  gt_unlock_out:
-    read_unlock(&rgt->lock);
+    grant_percpu_read_unlock(&rgt->lock);
 
     return rc;
 }
@@ -2589,7 +2603,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
     if ( gt->gt_version == op.version )
         goto out;
 
-    write_lock(&gt->lock);
+    grant_percpu_write_lock(&gt->lock);
     /*
      * Make sure that the grant table isn't currently in use when we
      * change the version number, except for the first 8 entries which
@@ -2702,7 +2716,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
     gt->gt_version = op.version;
 
  out_unlock:
-    write_unlock(&gt->lock);
+    grant_percpu_write_unlock(&gt->lock);
 
  out:
     op.version = gt->gt_version;
@@ -2758,7 +2772,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
 
     op.status = GNTST_okay;
 
-    read_lock(&gt->lock);
+    grant_percpu_read_lock(&gt->lock);
 
     for ( i = 0; i < op.nr_frames; i++ )
     {
@@ -2767,7 +2781,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
             op.status = GNTST_bad_virt_addr;
     }
 
-    read_unlock(&gt->lock);
+    grant_percpu_read_unlock(&gt->lock);
 out2:
     rcu_unlock_domain(d);
 out1:
@@ -2817,7 +2831,7 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
     struct active_grant_entry *act_b = NULL;
     s16 rc = GNTST_okay;
 
-    write_lock(&gt->lock);
+    grant_percpu_write_lock(&gt->lock);
 
     /* Bounds check on the grant refs */
     if ( unlikely(ref_a >= nr_grant_entries(d->grant_table)))
@@ -2865,7 +2879,7 @@ out:
         active_entry_release(act_b);
     if ( act_a != NULL )
         active_entry_release(act_a);
-    write_unlock(&gt->lock);
+    grant_percpu_write_unlock(&gt->lock);
 
     rcu_unlock_domain(d);
 
@@ -2936,12 +2950,12 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t *cflush,
 
     if ( d != owner )
     {
-        read_lock(&owner->grant_table->lock);
+        grant_percpu_read_lock(&owner->grant_table->lock);
 
         ret = grant_map_exists(d, owner->grant_table, mfn, ref_count);
         if ( ret != 0 )
         {
-            read_unlock(&owner->grant_table->lock);
+            grant_percpu_read_unlock(&owner->grant_table->lock);
             rcu_unlock_domain(d);
             put_page(page);
             return ret;
@@ -2961,7 +2975,7 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t *cflush,
         ret = 0;
 
     if ( d != owner )
-        read_unlock(&owner->grant_table->lock);
+        grant_percpu_read_unlock(&owner->grant_table->lock);
     unmap_domain_page(v);
     put_page(page);
 
@@ -3180,7 +3194,7 @@ grant_table_create(
         goto no_mem_0;
 
     /* Simple stuff. */
-    rwlock_init(&t->lock);
+    percpu_rwlock_resource_init(&t->lock, grant_rwlock);
     spin_lock_init(&t->maptrack_lock);
     t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES;
 
@@ -3282,7 +3296,7 @@ gnttab_release_mappings(
         }
 
         rgt = rd->grant_table;
-        read_lock(&rgt->lock);
+        grant_percpu_read_lock(&rgt->lock);
 
         act = active_entry_acquire(rgt, ref);
         sha = shared_entry_header(rgt, ref);
@@ -3343,7 +3357,7 @@ gnttab_release_mappings(
             gnttab_clear_flag(_GTF_reading, status);
 
         active_entry_release(act);
-        read_unlock(&rgt->lock);
+        grant_percpu_read_unlock(&rgt->lock);
 
         rcu_unlock_domain(rd);
 
@@ -3360,7 +3374,7 @@ void grant_table_warn_active_grants(struct domain *d)
 
 #define WARN_GRANT_MAX 10
 
-    read_lock(&gt->lock);
+    grant_percpu_read_lock(&gt->lock);
 
     for ( ref = 0; ref != nr_grant_entries(gt); ref++ )
     {
@@ -3382,7 +3396,7 @@ void grant_table_warn_active_grants(struct domain *d)
         printk(XENLOG_G_DEBUG "Dom%d has too many (%d) active grants to report\n",
                d->domain_id, nr_active);
 
-    read_unlock(&gt->lock);
+    grant_percpu_read_unlock(&gt->lock);
 
 #undef WARN_GRANT_MAX
 }
@@ -3432,7 +3446,7 @@ static void gnttab_usage_print(struct domain *rd)
     printk("      -------- active --------       -------- shared --------\n");
     printk("[ref] localdom mfn      pin          localdom gmfn     flags\n");
 
-    read_lock(&gt->lock);
+    grant_percpu_read_lock(&gt->lock);
 
     for ( ref = 0; ref != nr_grant_entries(gt); ref++ )
     {
@@ -3475,7 +3489,7 @@ static void gnttab_usage_print(struct domain *rd)
         active_entry_release(act);
     }
 
-    read_unlock(&gt->lock);
+    grant_percpu_read_unlock(&gt->lock);
 
     if ( first )
         printk("grant-table for remote domain:%5d ... "
diff --git a/xen/include/xen/grant_table.h b/xen/include/xen/grant_table.h
index 1c29cee..831472e 100644
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -51,13 +51,35 @@
 /* The maximum size of a grant table. */
 extern unsigned int max_grant_frames;
 
+DECLARE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);
+
+static inline void grant_percpu_read_lock(percpu_rwlock_t *lock)
+{
+    percpu_read_lock(grant_rwlock, lock);
+}
+
+static inline void grant_percpu_read_unlock(percpu_rwlock_t *lock)
+{
+    percpu_read_unlock(grant_rwlock, lock);
+}
+
+static inline void grant_percpu_write_lock(percpu_rwlock_t *lock)
+{
+    percpu_write_lock(grant_rwlock, lock);
+}
+
+static inline void grant_percpu_write_unlock(percpu_rwlock_t *lock)
+{
+    percpu_write_unlock(grant_rwlock, lock);
+}
+
 /* Per-domain grant information. */
 struct grant_table {
     /*
      * Lock protecting updates to grant table state (version, active
      * entry list, etc.)
      */
-    rwlock_t              lock;
+    percpu_rwlock_t       lock;
     /* Table size. Number of frames shared with guest */
     unsigned int          nr_grant_frames;
     /* Shared grant table (see include/public/grant_table.h). */
-- 
1.7.12.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
