[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] Regression in kernel 3.5 as Dom0 regarding PCI Passthrough?!



me again :)

It seems the crash is not always a "fatal" one:

[  247.080617] vif vif-2-0: 2 reading script
[  247.083519] br0: port 4(vif2.0) entered disabled state
[  247.084144] br0: port 4(vif2.0) entered disabled state
[  250.700029] ------------[ cut here ]------------
[  250.700046] kernel BUG at drivers/xen/balloon.c:359!
[  250.700059] invalid opcode: 0000 [#1] PREEMPT SMP
[  250.700071] CPU 4
[  250.700075] Modules linked in: joydev hid_generic uvcvideo snd_usb_audio snd_seq_midi snd_usbmidi_lib snd_hwdep snd_rawmidi videobuf2_vmalloc videobuf2_memops videobuf2_core videodev gpio_ich [last unloaded: scsi_wait_scan]
[  250.700122]
[  250.700125] Pid: 23, comm: kworker/4:0 Not tainted 3.5.0 #3 /DX58SO
[  250.700139] RIP: e030:[<ffffffff81447f95>] [<ffffffff81447f95>] balloon_process+0x385/0x3a0
[  250.700158] RSP: e02b:ffff8801317b9dc0  EFLAGS: 00010213
[  250.700162] RAX: 000000021f895000 RBX: 0000000000000000 RCX: 0000000000000002
[  250.700167] RDX: ffffffff82027000 RSI: 0000000000000137 RDI: 00000000000a2337
[  250.700172] RBP: ffff8801317b9e20 R08: ffff88014068e140 R09: 00000000fffffffc
[  250.700180] R10: 0000000000000001 R11: 0000000000000000 R12: 0000160000000000
[  250.700185] R13: 0000000000000001 R14: 00000000000a2337 R15: ffffea000288cdc0
[  250.700192] FS: 00007fb82ee14700(0000) GS:ffff880140680000(0000) knlGS:0000000000000000
[  250.700198] CS:  e033 DS: 0000 ES: 0000 CR0: 000000008005003b
[  250.700202] CR2: 00007fb82e7b39a6 CR3: 0000000001e0c000 CR4: 0000000000002660
[  250.700207] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  250.700213] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  250.700218] Process kworker/4:0 (pid: 23, threadinfo ffff8801317b8000, task ffff88013178db00)
[  250.700223] Stack:
[  250.700225] 000000000006aa7b 0000000000000001 ffffffff8200ea80 0000000000000001
[  250.700293] 0000000000000000 0000000000007ff0 ffff8801317b9e00 ffff880131796400
[  250.700301] ffff880140697000 ffff88014068e140 0000000000000000 ffffffff81e587c0
[  250.700311] Call Trace:
[  250.700317]  [<ffffffff8106753b>] process_one_work+0x12b/0x450
[  250.700322]  [<ffffffff81447c10>] ? decrease_reservation+0x320/0x320
[  250.700328]  [<ffffffff810688be>] worker_thread+0x12e/0x2d0
[ 250.700334] [<ffffffff81068790>] ? manage_workers.isra.26+0x1f0/0x1f0
[  250.700340]  [<ffffffff8106db7e>] kthread+0x8e/0xa0
[  250.700346]  [<ffffffff8184e3e4>] kernel_thread_helper+0x4/0x10
[  250.700353]  [<ffffffff8184c87c>] ? retint_restore_args+0x5/0x6
[  250.700358]  [<ffffffff8184e3e0>] ? gs_change+0x13/0x13
[ 250.700362] Code: 01 15 f0 6a bc 00 48 29 d0 48 89 05 ee 6a bc 00 e9 31 fd ff ff 0f 0b 0f 0b 4c 89 f7 e8 85 34 bc ff 48 83 f8 ff 0f 84 2b fe ff ff <0f> 0b 66 0f 1f 84 00 00 00 00 00 48 83 c1 01 e9 c2 fd ff ff 0f
[  250.700471] RIP  [<ffffffff81447f95>] balloon_process+0x385/0x3a0
[  250.700482]  RSP <ffff8801317b9dc0>
[  250.733955] ---[ end trace a5e5187e8ed6c1ff ]---
[ 250.733982] BUG: unable to handle kernel paging request at fffffffffffffff8
[  250.733992] IP: [<ffffffff8106e08c>] kthread_data+0xc/0x20
[  250.733999] PGD 1e0e067 PUD 1e0f067 PMD 0
[  250.734006] Oops: 0000 [#2] PREEMPT SMP
[  250.734013] CPU 4
[  250.734016] Modules linked in: joydev hid_generic uvcvideo snd_usb_audio snd_seq_midi snd_usbmidi_lib snd_hwdep snd_rawmidi videobuf2_vmalloc videobuf2_memops videobuf2_core videodev gpio_ich [last unloaded: scsi_wait_scan]
[  250.734071]
[  250.734073] Pid: 23, comm: kworker/4:0 Tainted: G D 3.5.0 #3 /DX58SO
[  250.734095] RIP: e030:[<ffffffff8106e08c>] [<ffffffff8106e08c>] kthread_data+0xc/0x20
[  250.734111] RSP: e02b:ffff8801317b9a90  EFLAGS: 00010092
[  250.734122] RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000004
[  250.734137] RDX: ffffffff81fcba40 RSI: 0000000000000004 RDI: ffff88013178db00
[  250.734151] RBP: ffff8801317b9aa8 R08: 0000000000989680 R09: ffffffff81fcba40
[  250.734166] R10: ffffffff8104960a R11: 0000000000000000 R12: ffff8801406936c0
[  250.734178] R13: 0000000000000004 R14: ffff88013178daf0 R15: ffff88013178db00
[  250.734196] FS: 00007fb82ee14700(0000) GS:ffff880140680000(0000) knlGS:0000000000000000
[  250.734202] CS:  e033 DS: 0000 ES: 0000 CR0: 000000008005003b
[  250.734209] CR2: fffffffffffffff8 CR3: 0000000001e0c000 CR4: 0000000000002660
[  250.734222] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  250.734235] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  250.734249] Process kworker/4:0 (pid: 23, threadinfo ffff8801317b8000, task ffff88013178db00)
[  250.734266] Stack:
[  250.734271] ffffffff810698e0 ffff8801317b9aa8 ffff88013178ded8 ffff8801317b9b18
[  250.734292] ffffffff8184ae02 ffff8801317b9fd8 ffff88013178db00 ffff8801317b9fd8
[  250.734313] ffff8801317b9fd8 ffff8801334796c0 ffff88013178db00 ffff8801317b9ae8
[  250.734979] Call Trace:
[  250.735572]  [<ffffffff810698e0>] ? wq_worker_sleeping+0x10/0xa0
[  250.736179]  [<ffffffff8184ae02>] __schedule+0x592/0x7d0
[  250.736783]  [<ffffffff8184b164>] schedule+0x24/0x70
[  250.737373]  [<ffffffff81051592>] do_exit+0x5b2/0x910
[  250.737937]  [<ffffffff8183ea1e>] ? printk+0x48/0x4a
[  250.738498]  [<ffffffff8100ace2>] ? check_events+0x12/0x20
[  250.739053]  [<ffffffff81017581>] oops_end+0x71/0xa0
[  250.739596]  [<ffffffff810176f3>] die+0x53/0x80
[  250.740134]  [<ffffffff810143f8>] do_trap+0xb8/0x160
[  250.740668]  [<ffffffff810146f3>] do_invalid_op+0xa3/0xb0
[  250.741203]  [<ffffffff81447f95>] ? balloon_process+0x385/0x3a0
[  250.741737]  [<ffffffff81085f52>] ? load_balance+0xd2/0x800
[  250.742267]  [<ffffffff81006276>] ? xen_flush_tlb+0xd6/0x2a0
[  250.742803]  [<ffffffff8108117d>] ? cpuacct_charge+0x6d/0xb0
[  250.743332]  [<ffffffff8184e25b>] invalid_op+0x1b/0x20
[  250.743855]  [<ffffffff81447f95>] ? balloon_process+0x385/0x3a0
[  250.744374]  [<ffffffff8106753b>] process_one_work+0x12b/0x450
[  250.744897]  [<ffffffff81447c10>] ? decrease_reservation+0x320/0x320
[  250.745426]  [<ffffffff810688be>] worker_thread+0x12e/0x2d0
[ 250.745942] [<ffffffff81068790>] ? manage_workers.isra.26+0x1f0/0x1f0
[  250.746457]  [<ffffffff8106db7e>] kthread+0x8e/0xa0
[  250.746969]  [<ffffffff8184e3e4>] kernel_thread_helper+0x4/0x10
[  250.747480]  [<ffffffff8184c87c>] ? retint_restore_args+0x5/0x6
[  250.747990]  [<ffffffff8184e3e0>] ? gs_change+0x13/0x13
[ 250.748487] Code: e0 ff ff 01 48 8b 80 38 e0 ff ff a8 08 0f 84 3d ff ff ff e8 57 d0 7d 00 e9 33 ff ff ff 66 90 48 8b 87 80 03 00 00 55 48 89 e5 5d <48> 8b 40 f8 c3 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55
[  250.749575] RIP  [<ffffffff8106e08c>] kthread_data+0xc/0x20
[  250.750103]  RSP <ffff8801317b9a90>
[  250.750627] CR2: fffffffffffffff8
[  250.751151] ---[ end trace a5e5187e8ed6c200 ]---
[  250.751152] Fixing recursive fault but reboot is needed!
[ 311.042233] INFO: rcu_preempt detected stalls on CPUs/tasks: { 4} (detected by 7, t=60011 jiffies)
[  311.042237] INFO: Stall ended before state dump start
[ 491.279642] INFO: rcu_preempt detected stalls on CPUs/tasks: { 4} (detected by 7, t=240249 jiffies)
[  491.279646] INFO: Stall ended before state dump start
[ 671.670546] INFO: rcu_preempt detected stalls on CPUs/tasks: { 4} (detected by 7, t=420638 jiffies)
[  671.670550] INFO: Stall ended before state dump start
[ 763.240862] INFO: rcu_bh detected stalls on CPUs/tasks: { 1 4} (detected by 5, t=63547 jiffies)
[  763.240867] INFO: Stall ended before state dump start
[ 853.438186] INFO: rcu_preempt detected stalls on CPUs/tasks: { 4} (detected by 7, t=602410 jiffies)
[  853.438190] INFO: Stall ended before state dump start
[ 943.632087] INFO: rcu_bh detected stalls on CPUs/tasks: { 1 4} (detected by 0, t=243935 jiffies)
[  943.632092] INFO: Stall ended before state dump start
[ 1033.828726] INFO: rcu_preempt detected stalls on CPUs/tasks: { 4} (detected by 7, t=782798 jiffies)
[ 1033.828729] INFO: Stall ended before state dump start


Now Dom0 still responds, but it is so sluggish that it is mostly unusable...

Am 05.09.2012 20:54, schrieb Konrad Rzeszutek Wilk:
> > > And it's due to a patch I added in v3.4
> > > (cd9db80e5257682a7f7ab245a2459648b3c8d268)
> > > - which did not work properly in v3.4, but with v3.5 got it working
> > > (977f857ca566a1e68045fcbb7cfc9c4acb077cf0) which causes v3.5 to not
> > > work anymore.
> > >
> > > Anyhow, for right now just revert
> > > cd9db80e5257682a7f7ab245a2459648b3c8d268
> > > and it should work for you.
> > >
Confirmed, after reverting that commit, VT-d will work fine.
Will you fix this and push it to upstream Linux, Konrad?

> > Also, our team reported a VT-d bug 2 months ago.
> > http://bugzilla.xen.org/bugzilla/show_bug.cgi?id=1824
>

Could either one of you please test this patch:


diff --git a/drivers/xen/xen-pciback/pci_stub.c
b/drivers/xen/xen-pciback/pci_stub.c
index 097e536..425bd0b 100644
--- a/drivers/xen/xen-pciback/pci_stub.c
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -4,6 +4,8 @@
  * Ryan Wilson <hap9@xxxxxxxxxxxxxx>
  * Chris Bookholt <hap10@xxxxxxxxxxxxxx>
  */
+#define DEBUG 1
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/rwsem.h>
@@ -97,13 +99,15 @@ static void pcistub_device_release(struct kref *kref)
        /* Call the reset function which does not take lock as this
         * is called from "unbind" which takes a device_lock mutex.
         */
+       dev_dbg(&psdev->dev->dev, "FLR locked..\n");
        __pci_reset_function_locked(psdev->dev);
        if (pci_load_and_free_saved_state(psdev->dev,
                                          &dev_data->pci_saved_state)) {
                dev_dbg(&psdev->dev->dev, "Could not reload PCI state\n");
-       } else
+       } else {
+               dev_dbg(&psdev->dev->dev, "Reloading PCI state..\n");
                pci_restore_state(psdev->dev);
-
+       }
        /* Disable the device */
        xen_pcibk_reset_device(psdev->dev);

@@ -353,16 +357,16 @@ static int __devinit pcistub_init_device(struct
pci_dev *dev)
        if (err)
                goto config_release;

-       dev_dbg(&dev->dev, "reseting (FLR, D3, etc) the device\n");
-       __pci_reset_function_locked(dev);
-
        /* We need the device active to save the state. */
        dev_dbg(&dev->dev, "save state of device\n");
        pci_save_state(dev);
        dev_data->pci_saved_state = pci_store_saved_state(dev);
        if (!dev_data->pci_saved_state)
                dev_err(&dev->dev, "Could not store PCI conf saved state!\n");
-
+       else {
+               dev_dbg(&dev->dev, "reseting (FLR, D3, etc) the device\n");
+               __pci_reset_function_locked(dev);
+       }
        /* Now disable the device (this also ensures some private device
         * data is setup before we export)
         */


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.