
Re: [Xen-devel] [PATCH V2] amd iommu: re-enable iommu msi if dom0 disabled it



>>> On 14.06.12 at 14:13, Wei Wang <wei.wang2@xxxxxxx> wrote:
> Am 12.06.2012 18:43, schrieb Jan Beulich:
>>>>> On 12.06.12 at 18:08, Andrew Cooper<andrew.cooper3@xxxxxxxxxx>  wrote:
>>> On 12/06/12 16:13, Jan Beulich wrote:
>>>>>>> On 12.06.12 at 14:02, Wei Wang<wei.wang2@xxxxxxx>  wrote:
>>>>> I had attached a revised patch, please check it.
>>>> While the patch technically looks better now, you didn't eliminate
>>>> my objections to the approach you take, nor did you comment on
>>>> the proposed alternative.
>>>>
>>>> A fundamental problem is that your IOMMUs show up as "normal"
>>>> PCI devices, breaking the separation between what is being
>>>> managed by the hypervisor vs by the Dom0 kernel. (This even
>>>> allows something as odd as passing through an IOMMU to a
>>>> DomU, which would clearly upset the hypervisor.)
>>>>
>>>>> I found that the following Linux commit triggers this issue. It has been
>>>>> included into 3.4 pv_ops.
>>>>>
>>>>> " commit a776c491ca5e38c26d9f66923ff574d041e747f4
>>>>>     Author: Eric W. Biederman<ebiederm@xxxxxxxxxxxx>
>>>>>     Date:   Mon Oct 17 11:46:06 2011 -0700
>>>>>
>>>>>     PCI: msi: Disable msi interrupts when we initialize a pci device "
>>>> Thanks for locating this. As it stands, the fix is incomplete
>>>> anyway: if the kexec-ed kernel is built without CONFIG_PCI_MSI,
>>>> it has no means to suppress the "screaming" interrupts.
>>>
>>> I feel that the correct solution would be for Xen to hide the PCI
>>> devices from dom0.  Xen already hides the DMAR ACPI table (by turning it
>>> into an XMAR table), and this would be the logical way to proceed now that
>>> IOMMU internals are appearing as PCI devices.
> 
> That sounds absolutely good to me. Thanks for the suggestion.
> 
>> That is precisely what I suggested in my response to the first
>> version of this patch, and I'd also volunteer to look into putting
>> together a first draft implementation if we sort of agree that
>> this is the way to go.
> 
> Cool! Thanks for doing that. Looking forward to it in Xen 4.2, since
> IOMMU MSI is really broken with recent Linux dom0...
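
As background on why recent Dom0 kernels break this: the effect of the
commit quoted above is roughly the sketch below. This is a simplified
paraphrase with a made-up function name, not the literal kernel change;
the point is merely that the PCI core now clears the MSI/MSI-X enable
bits of every device it initializes, to silence "screaming" interrupts
left armed by a previously running kernel. When Dom0 does that to the
IOMMU's PCI function, it also disables the (event log) interrupt Xen
had set up for itself.

/* Simplified illustration of the commit's effect; hypothetical helper,
 * not the actual kernel code. */
static void disable_msi_at_init(struct pci_dev *dev)
{
    u16 ctrl;
    int pos = pci_find_capability(dev, PCI_CAP_ID_MSI);

    if (pos) {
        pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &ctrl);
        pci_write_config_word(dev, pos + PCI_MSI_FLAGS,
                              ctrl & ~PCI_MSI_FLAGS_ENABLE);
    }

    pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
    if (pos) {
        pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &ctrl);
        pci_write_config_word(dev, pos + PCI_MSIX_FLAGS,
                              ctrl & ~PCI_MSIX_FLAGS_ENABLE);
    }
}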

Rather than submitting it for inclusion immediately, I'd prefer to see
you use it successfully first. Below/attached is what appears to work
fine for me in contrived situations (i.e. hiding bridges with nothing
connected behind them, as I still don't have an AMD IOMMU system at
hand).
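
For anyone wanting to reproduce the contrived test on a system without
an AMD IOMMU: the hook the patch adds for this purpose is
pci_ro_device(), called from iommu_detect.c further down. Something
along these lines, run once from a suitable __init function and with
the hard-coded BDF replaced by a device that is safe to hide on your
box, should be all that's needed (illustration only, not part of the
patch):

    /* Illustration only: hide 0000:00:14.4 from Dom0 the same way
     * amd_iommu_detect_one_acpi() below does for the IOMMU function. */
    if ( pci_ro_device(0, 0, PCI_DEVFN(0x14, 4)) )
        printk(XENLOG_ERR "could not mark 00:14.4 config space read-only\n");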

Jan

Note that, because ptwr_do_page_fault() is run first, a MEM_LOG()
message will be issued for each such write attempt. If that's
undesirable, the order of the two calls would need to be swapped.
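
Purely as an illustration (not part of the patch below), with the order
swapped the new code in fixup_page_fault() would look roughly like
this, i.e. the privileged-domain r/o MMIO check gets tried first:

#ifdef __x86_64__
        if ( IS_PRIV(d) && (regs->error_code & PFEC_page_present) &&
             mmio_ro_do_page_fault(v, addr, regs) )
            return EXCRET_fault_fixed;
#endif

        if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
             /* Do not check if access-protection fault since the page may
                legitimately be not present in shadow page tables */
             (paging_mode_enabled(d) ||
              (regs->error_code & PFEC_page_present)) &&
             ptwr_do_page_fault(v, addr, regs) )
            return EXCRET_fault_fixed;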

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1875,6 +1875,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
             break;
         case 1:
             l1e_remove_flags(nl1e, _PAGE_RW);
+            rc = 0;
             break;
         }
         if ( page )
@@ -5208,6 +5209,97 @@ int ptwr_do_page_fault(struct vcpu *v, u
     return 0;
 }
 
+#ifdef __x86_64__
+/*************************
+ * fault handling for read-only MMIO pages
+ */
+
+struct mmio_ro_emulate_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    unsigned long cr2;
+};
+
+static int mmio_ro_emulated_read(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int mmio_ro_emulated_write(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    struct mmio_ro_emulate_ctxt *mmio_ro_ctxt =
+        container_of(ctxt, struct mmio_ro_emulate_ctxt, ctxt);
+
+    /* Only allow naturally-aligned stores at the original %cr2 address. */
+    if ( ((bytes | offset) & (bytes - 1)) || offset != mmio_ro_ctxt->cr2 )
+    {
+        MEM_LOG("mmio_ro_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
+                mmio_ro_ctxt->cr2, offset, bytes);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static const struct x86_emulate_ops mmio_ro_emulate_ops = {
+    .read       = mmio_ro_emulated_read,
+    .insn_fetch = ptwr_emulated_read,
+    .write      = mmio_ro_emulated_write,
+};
+
+/* Check if guest is trying to modify a r/o MMIO page. */
+int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr,
+                          struct cpu_user_regs *regs)
+{
+    l1_pgentry_t      pte;
+    unsigned long     mfn;
+    unsigned int      addr_size = is_pv_32on64_domain(v->domain) ?
+                                  32 : BITS_PER_LONG;
+    struct mmio_ro_emulate_ctxt mmio_ro_ctxt = {
+        .ctxt.regs = regs,
+        .ctxt.addr_size = addr_size,
+        .ctxt.sp_size = addr_size,
+        .cr2 = addr
+    };
+    int rc;
+
+    /* Attempt to read the PTE that maps the VA being accessed. */
+    guest_get_eff_l1e(v, addr, &pte);
+
+    /* We are looking only for read-only mappings of MMIO pages. */
+    if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) )
+        return 0;
+
+    mfn = l1e_get_pfn(pte);
+    if ( mfn_valid(mfn) )
+    {
+        struct page_info *page = mfn_to_page(mfn);
+        struct domain *owner = page_get_owner_and_reference(page);
+
+        if ( owner )
+            put_page(page);
+        if ( owner != dom_io )
+            return 0;
+    }
+
+    if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
+        return 0;
+
+    rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops);
+
+    return rc != X86EMUL_UNHANDLEABLE ? EXCRET_fault_fixed : 0;
+}
+#endif /* __x86_64__ */
+
 void free_xen_pagetable(void *v)
 {
     if ( system_state == SYS_STATE_early_boot )
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1349,20 +1349,23 @@ static int fixup_page_fault(unsigned lon
         return 0;
     }
 
-    if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
-         guest_kernel_mode(v, regs) )
-    {
-        unsigned int mbs = PFEC_write_access;
-        unsigned int mbz = PFEC_reserved_bit | PFEC_insn_fetch;
-
-        /* Do not check if access-protection fault since the page may 
-           legitimately be not present in shadow page tables */
-        if ( !paging_mode_enabled(d) )
-            mbs |= PFEC_page_present;
-
-        if ( ((regs->error_code & (mbs | mbz)) == mbs) &&
+    if ( guest_kernel_mode(v, regs) &&
+         !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
+         (regs->error_code & PFEC_write_access) )
+    {
+        if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
+             /* Do not check if access-protection fault since the page may
+                legitimately be not present in shadow page tables */
+             (paging_mode_enabled(d) ||
+              (regs->error_code & PFEC_page_present)) &&
              ptwr_do_page_fault(v, addr, regs) )
             return EXCRET_fault_fixed;
+
+#ifdef __x86_64__
+        if ( IS_PRIV(d) && (regs->error_code & PFEC_page_present) &&
+             mmio_ro_do_page_fault(v, addr, regs) )
+            return EXCRET_fault_fixed;
+#endif
     }
 
     /* For non-external shadowed guests, we fix up both their own 
@@ -1690,6 +1693,13 @@ static int pci_cfg_ok(struct domain *d, 
         return 0;
 
     machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF;
+    if ( write )
+    {
+        const unsigned long *ro_map = pci_get_ro_map(0);
+
+        if ( ro_map && test_bit(machine_bdf, ro_map) )
+            return 0;
+    }
     start = d->arch.pci_cf8 & 0xFF;
     end = start + size - 1;
     if (xsm_pci_config_permission(d, machine_bdf, start, end, write))
--- a/xen/arch/x86/x86_32/pci.c
+++ b/xen/arch/x86/x86_32/pci.c
@@ -6,6 +6,7 @@
 
 #include <xen/spinlock.h>
 #include <xen/pci.h>
+#include <xen/init.h>
 #include <asm/io.h>
 
 #define PCI_CONF_ADDRESS(bus, dev, func, reg) \
@@ -70,3 +71,7 @@ void pci_conf_write32(
     BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
     pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data);
 }
+
+void __init arch_pci_ro_device(int seg, int bdf)
+{
+}
--- a/xen/arch/x86/x86_64/mmconfig_64.c
+++ b/xen/arch/x86/x86_64/mmconfig_64.c
@@ -145,9 +145,30 @@ static void __iomem *mcfg_ioremap(const 
     return (void __iomem *) virt;
 }
 
+void arch_pci_ro_device(int seg, int bdf)
+{
+    unsigned int idx, bus = PCI_BUS(bdf);
+
+    for (idx = 0; idx < pci_mmcfg_config_num; ++idx) {
+        const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg;
+        unsigned long mfn = (cfg->address >> PAGE_SHIFT) + bdf;
+
+        if (!pci_mmcfg_virt[idx].virt || cfg->pci_segment != seg ||
+            cfg->start_bus_number > bus || cfg->end_bus_number < bus)
+            continue;
+
+        if (rangeset_add_singleton(mmio_ro_ranges, mfn))
+            printk(XENLOG_ERR
+                   "%04x:%02x:%02x.%u: could not mark MCFG (mfn %#lx) 
read-only\n",
+                   cfg->pci_segment, bus, PCI_SLOT(bdf), PCI_FUNC(bdf),
+                   mfn);
+    }
+}
+
 int pci_mmcfg_arch_enable(unsigned int idx)
 {
     const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
+    const unsigned long *ro_map = pci_get_ro_map(cfg->pci_segment);
 
     if (pci_mmcfg_virt[idx].virt)
         return 0;
@@ -159,6 +180,16 @@ int pci_mmcfg_arch_enable(unsigned int i
     }
     printk(KERN_INFO "PCI: Using MCFG for segment %04x bus %02x-%02x\n",
            cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
+    if (ro_map) {
+        unsigned int bdf = PCI_BDF(cfg->start_bus_number, 0, 0);
+        unsigned int end = PCI_BDF(cfg->end_bus_number, -1, -1);
+
+        while ((bdf = find_next_bit(ro_map, end + 1, bdf)) <= end) {
+            arch_pci_ro_device(cfg->pci_segment, bdf);
+            if (bdf++ == end)
+                break;
+        }
+    }
     return 0;
 }
 
--- a/xen/drivers/passthrough/amd/iommu_detect.c
+++ b/xen/drivers/passthrough/amd/iommu_detect.c
@@ -153,6 +153,12 @@ int __init amd_iommu_detect_one_acpi(
     if ( rt )
         return -ENODEV;
 
+    rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func));
+    if ( rt )
+        printk(XENLOG_ERR
+               "Could not mark config space of %04x:%02x:%02x.%u read-only 
(%d)\n",
+               iommu->seg, bus, dev, func, rt);
+
     list_add_tail(&iommu->list, &amd_iommu_head);
 
     return 0;
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -593,11 +593,3 @@ void hvm_dpci_eoi(struct domain *d, unsi
 unlock:
     spin_unlock(&d->event_lock);
 }
-
-static int __init setup_mmio_ro_ranges(void)
-{
-    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
-                                  RANGESETF_prettyprint_hex);
-    return 0;
-}
-__initcall(setup_mmio_ro_ranges);
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -36,6 +36,7 @@
 struct pci_seg {
     struct list_head alldevs_list;
     u16 nr;
+    unsigned long *ro_map;
     /* bus2bridge_lock protects bus2bridge array */
     spinlock_t bus2bridge_lock;
 #define MAX_BUSES 256
@@ -106,6 +107,8 @@ void __init pt_pci_init(void)
     radix_tree_init(&pci_segments);
     if ( !alloc_pseg(0) )
         panic("Could not initialize PCI segment 0\n");
+    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
+                                  RANGESETF_prettyprint_hex);
 }
 
 int __init pci_add_segment(u16 seg)
@@ -113,6 +116,13 @@ int __init pci_add_segment(u16 seg)
     return alloc_pseg(seg) ? 0 : -ENOMEM;
 }
 
+const unsigned long *pci_get_ro_map(u16 seg)
+{
+    struct pci_seg *pseg = get_pseg(seg);
+
+    return pseg ? pseg->ro_map : NULL;
+}
+
 static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
 {
     struct pci_dev *pdev;
@@ -198,6 +208,33 @@ static void free_pdev(struct pci_seg *ps
     xfree(pdev);
 }
 
+int __init pci_ro_device(int seg, int bus, int devfn)
+{
+    struct pci_seg *pseg = alloc_pseg(seg);
+    struct pci_dev *pdev;
+
+    if ( !pseg )
+        return -ENOMEM;
+    pdev = alloc_pdev(pseg, bus, devfn);
+    if ( !pdev )
+        return -ENOMEM;
+
+    if ( !pseg->ro_map )
+    {
+        size_t sz = BITS_TO_LONGS(PCI_BDF(-1, -1, -1) + 1) * sizeof(long);
+
+        pseg->ro_map = alloc_xenheap_pages(get_order_from_bytes(sz), 0);
+        if ( !pseg->ro_map )
+            return -ENOMEM;
+        memset(pseg->ro_map, 0, sz);
+    }
+
+    __set_bit(PCI_BDF2(bus, devfn), pseg->ro_map);
+    arch_pci_ro_device(seg, PCI_BDF2(bus, devfn));
+
+    return 0;
+}
+
 struct pci_dev *pci_get_pdev(int seg, int bus, int devfn)
 {
     struct pci_seg *pseg = get_pseg(seg);
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -555,6 +555,8 @@ void memguard_unguard_stack(void *p);
 
 int  ptwr_do_page_fault(struct vcpu *, unsigned long,
                         struct cpu_user_regs *);
+int  mmio_ro_do_page_fault(struct vcpu *, unsigned long,
+                           struct cpu_user_regs *);
 
 int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
 
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -98,8 +98,11 @@ struct pci_dev *pci_lock_domain_pdev(
 void setup_dom0_pci_devices(struct domain *, void (*)(struct pci_dev *));
 void pci_release_devices(struct domain *d);
 int pci_add_segment(u16 seg);
+const unsigned long *pci_get_ro_map(u16 seg);
 int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *);
 int pci_remove_device(u16 seg, u8 bus, u8 devfn);
+int pci_ro_device(int seg, int bus, int devfn);
+void arch_pci_ro_device(int seg, int bdf);
 struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
 struct pci_dev *pci_get_pdev_by_domain(
     struct domain *, int seg, int bus, int devfn);

Attachment: pci-disallow-write.patch
Description: Text document
