[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] 4.11.0 RC1 panic



>>> On 10.06.18 at 12:57, <bouyer@xxxxxxxxxxxxxxx> wrote:
> (XEN) Xen call trace:
> (XEN)    [<ffff82d080284bd2>] mm.c#dec_linear_entries+0x12/0x20
> (XEN)    [<ffff82d08028922e>] mm.c#_put_page_type+0x13e/0x350
> (XEN)    [<ffff82d08023a00d>] _spin_lock+0xd/0x50
> (XEN)    [<ffff82d0802898af>] mm.c#put_page_from_l2e+0xdf/0x110
> (XEN)    [<ffff82d080288c59>] free_page_type+0x2f9/0x790
> (XEN)    [<ffff82d0802891f7>] mm.c#_put_page_type+0x107/0x350
> (XEN)    [<ffff82d0802898ef>] put_page_type_preemptible+0xf/0x10
> (XEN)    [<ffff82d080272adb>] domain.c#relinquish_memory+0xab/0x460
> (XEN)    [<ffff82d080276ae3>] domain_relinquish_resources+0x203/0x290
> (XEN)    [<ffff82d0802068bd>] domain_kill+0xbd/0x150
> (XEN)    [<ffff82d0802039e3>] do_domctl+0x7d3/0x1a90
> (XEN)    [<ffff82d080203210>] do_domctl+0/0x1a90
> (XEN)    [<ffff82d080367b95>] pv_hypercall+0x1f5/0x430
> (XEN)    [<ffff82d08036e422>] lstar_enter+0xa2/0x120
> (XEN)    [<ffff82d08036e42e>] lstar_enter+0xae/0x120
> (XEN)    [<ffff82d08036e422>] lstar_enter+0xa2/0x120
> (XEN)    [<ffff82d08036e42e>] lstar_enter+0xae/0x120
> (XEN)    [<ffff82d08036e422>] lstar_enter+0xa2/0x120
> (XEN)    [<ffff82d08036e42e>] lstar_enter+0xae/0x120
> (XEN)    [<ffff82d08036e48c>] lstar_enter+0x10c/0x120

Let's focus on this scenario for now, as it is under better (timing) control
on the Xen side. Below is a first debugging patch which
- avoids the ASSERT() in question, instead triggering a printk(), in the hope
  that the data logged and/or other ASSERT()s shed some additional light
  on the situation
- logs cleanup activity (this is likely to be quite chatty, so be sure you set
  up large enough internal buffers)

Ideally, if no other ASSERT() triggers as a result of the bypassed one,
you'd try to catch more than a single instance of the problem, so we can
see a possible pattern (if there is one). A simplistic first XTF test I've
created based on your description of the L2 handling model in NetBSD
did not trigger the interesting printk(), but at least that way I've been
able to see that the domain cleanup logging produces useful data.

At the very least I hope that with this we can determine whether the
root of the problem lies in page table teardown / cleanup, or in the
management of live page tables.

Jan

--- unstable.orig/xen/arch/x86/domain.c
+++ unstable/xen/arch/x86/domain.c
@@ -1872,6 +1872,7 @@ static int relinquish_memory(
 
     while ( (page = page_list_remove_head(list)) )
     {
+bool log = false;//temp
         /* Grab a reference to the page so it won't disappear from under us. */
         if ( unlikely(!get_page(page, d)) )
         {
@@ -1880,6 +1881,10 @@ static int relinquish_memory(
             continue;
         }
 
+if(is_pv_32bit_domain(d) && PGT_type_equal(page->u.inuse.type_info, PGT_l2_page_table)) {//temp
+ printk("d%d:%"PRI_mfn": %lx:%d\n", d->domain_id, mfn_x(page_to_mfn(page)), page->u.inuse.type_info, page->linear_pt_count);
+ log = true;
+}
         if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
             ret = put_page_and_type_preemptible(page);
         switch ( ret )
@@ -1921,7 +1926,13 @@ static int relinquish_memory(
             if ( likely(y == x) )
             {
                 /* No need for atomic update of type_info here: noone else updates it. */
-                switch ( ret = free_page_type(page, x, 1) )
+//temp                switch ( ret = free_page_type(page, x, 1) )
+ret = free_page_type(page, x, 1);//temp
+if(log) {//temp
+ printk("%"PRI_mfn" -> %lx:%d (%d,%d,%d)\n", mfn_x(page_to_mfn(page)), page->u.inuse.type_info,
+        page->linear_pt_count, ret, page->nr_validated_ptes, page->partial_pte);
+}
+switch(ret)//temp
                 {
                 case 0:
                     break;
--- unstable.orig/xen/arch/x86/mm.c
+++ unstable/xen/arch/x86/mm.c
@@ -705,12 +705,19 @@ static bool inc_linear_entries(struct pa
     return true;
 }
 
-static void dec_linear_entries(struct page_info *pg)
+//temp static void dec_linear_entries(struct page_info *pg)
+static const struct domain*dec_linear_entries(struct page_info*pg)//temp
 {
     typeof(pg->linear_pt_count) oc;
 
     oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
+{//temp
+ const struct domain*owner = page_get_owner(pg);
+ if(oc <= 0 && is_pv_32bit_domain(owner))
+  return owner;
+}
     ASSERT(oc > 0);
+return NULL;//temp
 }
 
 static bool inc_linear_uses(struct page_info *pg)
@@ -2617,8 +2624,15 @@ static int _put_final_page_type(struct p
     {
         if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
         {
+const struct domain*d;//temp
             dec_linear_uses(page);
+if((d = ({//temp
             dec_linear_entries(ptpg);
+})) != NULL) {//temp
+ printk("d%d: %"PRI_mfn":%lx:%d -> %"PRI_mfn":%lx:%d\n", d->domain_id,
+        mfn_x(page_to_mfn(ptpg)), ptpg->u.inuse.type_info, ptpg->linear_pt_count,
+        mfn_x(page_to_mfn(page)), page->u.inuse.type_info, page->linear_pt_count);
+}
         }
         ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
         set_tlbflush_timestamp(page);
@@ -2704,8 +2718,15 @@ static int _put_page_type(struct page_in
 
             if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
             {
+const struct domain*d;//temp
                 dec_linear_uses(page);
+if((d = ({//temp
                 dec_linear_entries(ptpg);
+})) != NULL) {//temp
+ printk("d%d: %"PRI_mfn":%lx:%d => %"PRI_mfn":%lx:%d\n", d->domain_id,
+        mfn_x(page_to_mfn(ptpg)), ptpg->u.inuse.type_info, ptpg->linear_pt_count,
+        mfn_x(page_to_mfn(page)), page->u.inuse.type_info, page->linear_pt_count);
+}
             }
 
             return 0;


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.