Re: [Xen-devel] [PATCH V2] amd iommu: re-enable iommu msi if dom0 disabled it
On 06/21/2012 05:49 PM, Jan Beulich wrote:
> On 21.06.12 at 17:29, Wei Wang <wei.wang2@xxxxxxx> wrote:
>> On 06/20/2012 05:45 PM, Jan Beulich wrote:
>>> Rather than submitting it for inclusion immediately, I'd rather see
>>> you first use it successfully. Below/attached is what appears to work
>>> fine for me in contrived situations (i.e. hiding bridges with nothing
>>> connected, as I still don't have any AMD IOMMU system at hand).
>>
>> The patch works for me. The IOMMU MSI capability is shown as enabled
>> with this patch.
>
> Keir, the question now arises whether we really want this, and
> particularly this late before 4.2. The Linux folks don't seem to be
> willing to take the straightforward workaround for the problem
> introduced at their end, so we will likely need something before 4.2
> goes out, the more so that the questionable fix already made it into
> various stable trees. Even the older trees would be affected; it is
> just that putting a change like this there is even more questionable.
>
> There are obviously more potential problems in this area: if any of
> the MMIO addresses used by AMD's IOMMU is configurable through one of
> the BARs, and if Dom0 decides to re-assign MMIO space, neither
> allowing the writes nor simply dropping them as done here will work.
> Whether that's a real problem I don't know - Wei? And there might be
> other issues arising from dropping all writes - we might just
> currently not be aware of them.

The AMD IOMMU does not have any PCI BARs; all address configuration goes
through ACPI tables. So this should be fine, at least for the AMD IOMMU.

Thanks,
Wei

> Jan
>
>> 00:00.2 Generic system peripheral [0806]: Advanced Micro Devices [AMD] Device 1419
>>         Subsystem: Advanced Micro Devices [AMD] Device 1419
>>         Flags: bus master, fast devsel, latency 0, IRQ 11
>>         Capabilities: [40] Secure device <?>
>>         Capabilities: [54] MSI: Enable+ Count=1/1 Maskable- 64bit+
>>         Capabilities: [64] HyperTransport: MSI Mapping Enable+ Fixed+
>
>>> Note that due to ptwr_do_page_fault() being run first, there'll be a
>>> MEM_LOG() issued for each such attempt. If that's undesirable, the
>>> order of the calls would need to be swapped.
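To make concrete what "simply dropping them" means in the patch below: the
write handler only claims a naturally aligned store at the original faulting
address (the data is then discarded); any other access is reported as
unhandleable, so the fault is propagated instead. Here is a minimal
standalone C sketch of just that acceptance test, with a made-up faulting
address and test cases for illustration:

/*
 * Standalone illustration (not part of the patch) of the acceptance test
 * used by mmio_ro_emulated_write() in the patch below: only a naturally
 * aligned store whose address equals the faulting %cr2 is "handled" (and
 * then silently dropped); everything else is left unhandleable.
 * The sample cr2 value and test cases are made up for this example.
 */
#include <stdio.h>
#include <stdbool.h>

static bool mmio_ro_write_accepted(unsigned long cr2,
                                   unsigned long offset, unsigned int bytes)
{
    /* Same predicate as in the patch, inverted to "accepted". */
    return !(((bytes | offset) & (bytes - 1)) || offset != cr2);
}

int main(void)
{
    const unsigned long cr2 = 0xf0000054UL;   /* example faulting address */
    struct { unsigned long off; unsigned int bytes; } t[] = {
        { 0xf0000054UL, 4 },   /* aligned dword at %cr2    -> dropped  */
        { 0xf0000054UL, 2 },   /* aligned word at %cr2     -> dropped  */
        { 0xf0000056UL, 4 },   /* not the faulting address -> rejected */
        { 0xf0000055UL, 2 },   /* misaligned word          -> rejected */
    };

    for (unsigned int i = 0; i < sizeof(t) / sizeof(t[0]); ++i)
        printf("offset %#lx, %u byte(s): %s\n", t[i].off, t[i].bytes,
               mmio_ro_write_accepted(cr2, t[i].off, t[i].bytes) ?
               "accepted (write dropped)" : "unhandleable (fault)");
    return 0;
}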
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1875,6 +1875,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
             break;
         case 1:
             l1e_remove_flags(nl1e, _PAGE_RW);
+            rc = 0;
             break;
         }
         if ( page )
@@ -5208,6 +5209,97 @@ int ptwr_do_page_fault(struct vcpu *v, u
     return 0;
 }
 
+#ifdef __x86_64__
+/*************************
+ * fault handling for read-only MMIO pages
+ */
+
+struct mmio_ro_emulate_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    unsigned long cr2;
+};
+
+static int mmio_ro_emulated_read(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int mmio_ro_emulated_write(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    struct mmio_ro_emulate_ctxt *mmio_ro_ctxt =
+        container_of(ctxt, struct mmio_ro_emulate_ctxt, ctxt);
+
+    /* Only allow naturally-aligned stores at the original %cr2 address. */
+    if ( ((bytes | offset) & (bytes - 1)) || offset != mmio_ro_ctxt->cr2 )
+    {
+        MEM_LOG("mmio_ro_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
+                mmio_ro_ctxt->cr2, offset, bytes);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static const struct x86_emulate_ops mmio_ro_emulate_ops = {
+    .read       = mmio_ro_emulated_read,
+    .insn_fetch = ptwr_emulated_read,
+    .write      = mmio_ro_emulated_write,
+};
+
+/* Check if guest is trying to modify a r/o MMIO page. */
+int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr,
+                          struct cpu_user_regs *regs)
+{
+    l1_pgentry_t pte;
+    unsigned long mfn;
+    unsigned int addr_size = is_pv_32on64_domain(v->domain) ?
+                             32 : BITS_PER_LONG;
+    struct mmio_ro_emulate_ctxt mmio_ro_ctxt = {
+        .ctxt.regs = regs,
+        .ctxt.addr_size = addr_size,
+        .ctxt.sp_size = addr_size,
+        .cr2 = addr
+    };
+    int rc;
+
+    /* Attempt to read the PTE that maps the VA being accessed. */
+    guest_get_eff_l1e(v, addr, &pte);
+
+    /* We are looking only for read-only mappings of MMIO pages. */
+    if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) )
+        return 0;
+
+    mfn = l1e_get_pfn(pte);
+    if ( mfn_valid(mfn) )
+    {
+        struct page_info *page = mfn_to_page(mfn);
+        struct domain *owner = page_get_owner_and_reference(page);
+
+        if ( owner )
+            put_page(page);
+        if ( owner != dom_io )
+            return 0;
+    }
+
+    if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
+        return 0;
+
+    rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops);
+
+    return rc != X86EMUL_UNHANDLEABLE ? EXCRET_fault_fixed : 0;
+}
+#endif /* __x86_64__ */
+
 void free_xen_pagetable(void *v)
 {
     if ( system_state == SYS_STATE_early_boot )
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1349,20 +1349,23 @@ static int fixup_page_fault(unsigned lon
             return 0;
     }
 
-    if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
-         guest_kernel_mode(v, regs) )
-    {
-        unsigned int mbs = PFEC_write_access;
-        unsigned int mbz = PFEC_reserved_bit | PFEC_insn_fetch;
-
-        /* Do not check if access-protection fault since the page may
-           legitimately be not present in shadow page tables */
-        if ( !paging_mode_enabled(d) )
-            mbs |= PFEC_page_present;
-
-        if ( ((regs->error_code & (mbs | mbz)) == mbs) &&
+    if ( guest_kernel_mode(v, regs) &&
+         !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
+         (regs->error_code & PFEC_write_access) )
+    {
+        if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
+             /* Do not check if access-protection fault since the page may
+                legitimately be not present in shadow page tables */
+             (paging_mode_enabled(d) ||
+              (regs->error_code & PFEC_page_present)) &&
             ptwr_do_page_fault(v, addr, regs) )
             return EXCRET_fault_fixed;
+
+#ifdef __x86_64__
+        if ( IS_PRIV(d) && (regs->error_code & PFEC_page_present) &&
+             mmio_ro_do_page_fault(v, addr, regs) )
+            return EXCRET_fault_fixed;
+#endif
     }
 
     /* For non-external shadowed guests, we fix up both their own
@@ -1690,6 +1693,13 @@ static int pci_cfg_ok(struct domain *d,
         return 0;
 
     machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF;
+    if ( write )
+    {
+        const unsigned long *ro_map = pci_get_ro_map(0);
+
+        if ( ro_map && test_bit(machine_bdf, ro_map) )
+            return 0;
+    }
     start = d->arch.pci_cf8 & 0xFF;
     end = start + size - 1;
     if (xsm_pci_config_permission(d, machine_bdf, start, end, write))
--- a/xen/arch/x86/x86_32/pci.c
+++ b/xen/arch/x86/x86_32/pci.c
@@ -6,6 +6,7 @@
 
 #include <xen/spinlock.h>
 #include <xen/pci.h>
+#include <xen/init.h>
 #include <asm/io.h>
 
 #define PCI_CONF_ADDRESS(bus, dev, func, reg) \
@@ -70,3 +71,7 @@ void pci_conf_write32(
     BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
     pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data);
 }
+
+void __init arch_pci_ro_device(int seg, int bdf)
+{
+}
--- a/xen/arch/x86/x86_64/mmconfig_64.c
+++ b/xen/arch/x86/x86_64/mmconfig_64.c
@@ -145,9 +145,30 @@ static void __iomem *mcfg_ioremap(const
     return (void __iomem *) virt;
 }
 
+void arch_pci_ro_device(int seg, int bdf)
+{
+    unsigned int idx, bus = PCI_BUS(bdf);
+
+    for (idx = 0; idx < pci_mmcfg_config_num; ++idx) {
+        const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg;
+        unsigned long mfn = (cfg->address >> PAGE_SHIFT) + bdf;
+
+        if (!pci_mmcfg_virt[idx].virt || cfg->pci_segment != seg ||
+            cfg->start_bus_number > bus || cfg->end_bus_number < bus)
+            continue;
+
+        if (rangeset_add_singleton(mmio_ro_ranges, mfn))
+            printk(XENLOG_ERR
+                   "%04x:%02x:%02x.%u: could not mark MCFG (mfn %#lx) read-only\n",
+                   cfg->pci_segment, bus, PCI_SLOT(bdf), PCI_FUNC(bdf),
+                   mfn);
+    }
+}
+
 int pci_mmcfg_arch_enable(unsigned int idx)
 {
     const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
+    const unsigned long *ro_map = pci_get_ro_map(cfg->pci_segment);
 
     if (pci_mmcfg_virt[idx].virt)
         return 0;
@@ -159,6 +180,16 @@ int pci_mmcfg_arch_enable(unsigned int i
     }
     printk(KERN_INFO "PCI: Using MCFG for segment %04x bus %02x-%02x\n",
            cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
+    if (ro_map) {
+        unsigned int bdf = PCI_BDF(cfg->start_bus_number, 0, 0);
+        unsigned int end = PCI_BDF(cfg->end_bus_number, -1, -1);
+
+        while ((bdf = find_next_bit(ro_map, end + 1, bdf)) <= end) {
+            arch_pci_ro_device(cfg->pci_segment, bdf);
+            if (bdf++ == end)
+                break;
+        }
+    }
 
     return 0;
 }
--- a/xen/drivers/passthrough/amd/iommu_detect.c
+++ b/xen/drivers/passthrough/amd/iommu_detect.c
@@ -153,6 +153,12 @@ int __init amd_iommu_detect_one_acpi(
     if ( rt )
         return -ENODEV;
 
+    rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func));
+    if ( rt )
+        printk(XENLOG_ERR
+               "Could not mark config space of %04x:%02x:%02x.%u read-only (%d)\n",
+               iommu->seg, bus, dev, func, rt);
+
     list_add_tail(&iommu->list, &amd_iommu_head);
 
     return 0;
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -593,11 +593,3 @@ void hvm_dpci_eoi(struct domain *d, unsi
 unlock:
     spin_unlock(&d->event_lock);
 }
-
-static int __init setup_mmio_ro_ranges(void)
-{
-    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
-                                  RANGESETF_prettyprint_hex);
-    return 0;
-}
-__initcall(setup_mmio_ro_ranges);
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -36,6 +36,7 @@ struct pci_seg {
     struct list_head alldevs_list;
     u16 nr;
+    unsigned long *ro_map;
     /* bus2bridge_lock protects bus2bridge array */
     spinlock_t bus2bridge_lock;
 #define MAX_BUSES 256
@@ -106,6 +107,8 @@ void __init pt_pci_init(void)
     radix_tree_init(&pci_segments);
     if ( !alloc_pseg(0) )
         panic("Could not initialize PCI segment 0\n");
+    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
+                                  RANGESETF_prettyprint_hex);
 }
 
 int __init pci_add_segment(u16 seg)
@@ -113,6 +116,13 @@ int __init pci_add_segment(u16 seg)
     return alloc_pseg(seg) ? 0 : -ENOMEM;
 }
 
+const unsigned long *pci_get_ro_map(u16 seg)
+{
+    struct pci_seg *pseg = get_pseg(seg);
+
+    return pseg ? pseg->ro_map : NULL;
+}
+
 static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
 {
     struct pci_dev *pdev;
@@ -198,6 +208,33 @@ static void free_pdev(struct pci_seg *ps
     xfree(pdev);
 }
 
+int __init pci_ro_device(int seg, int bus, int devfn)
+{
+    struct pci_seg *pseg = alloc_pseg(seg);
+    struct pci_dev *pdev;
+
+    if ( !pseg )
+        return -ENOMEM;
+    pdev = alloc_pdev(pseg, bus, devfn);
+    if ( !pdev )
+        return -ENOMEM;
+
+    if ( !pseg->ro_map )
+    {
+        size_t sz = BITS_TO_LONGS(PCI_BDF(-1, -1, -1) + 1) * sizeof(long);
+
+        pseg->ro_map = alloc_xenheap_pages(get_order_from_bytes(sz), 0);
+        if ( !pseg->ro_map )
+            return -ENOMEM;
+        memset(pseg->ro_map, 0, sz);
+    }
+
+    __set_bit(PCI_BDF2(bus, devfn), pseg->ro_map);
+    arch_pci_ro_device(seg, PCI_BDF2(bus, devfn));
+
+    return 0;
+}
+
 struct pci_dev *pci_get_pdev(int seg, int bus, int devfn)
 {
     struct pci_seg *pseg = get_pseg(seg);
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -555,6 +555,8 @@ void memguard_unguard_stack(void *p);
 
 int  ptwr_do_page_fault(struct vcpu *, unsigned long,
                         struct cpu_user_regs *);
+int  mmio_ro_do_page_fault(struct vcpu *, unsigned long,
+                           struct cpu_user_regs *);
 
 int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -98,8 +98,11 @@ struct pci_dev *pci_lock_domain_pdev(
 void setup_dom0_pci_devices(struct domain *, void (*)(struct pci_dev *));
 void pci_release_devices(struct domain *d);
 int pci_add_segment(u16 seg);
+const unsigned long *pci_get_ro_map(u16 seg);
 int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *);
 int pci_remove_device(u16 seg, u8 bus, u8 devfn);
+int pci_ro_device(int seg, int bus, int devfn);
+void arch_pci_ro_device(int seg, int bdf);
 struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
 struct pci_dev *pci_get_pdev_by_domain(
     struct domain *, int seg, int bus, int devfn);
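For orientation, and not as part of the patch: the per-segment ro_map
allocated in pci_ro_device() is a plain bitmap indexed by the 16-bit
bus/devfn number, so PCI_BDF(-1, -1, -1) + 1 = 65536 bits (8 KiB) cover a
whole segment, and pci_cfg_ok() and pci_mmcfg_arch_enable() only need
test_bit()/find_next_bit() lookups on it. A small standalone sketch of the
indexing, with the BDF macros restated locally just for this example:

/*
 * Standalone sketch (not part of the patch): how the per-segment ro_map
 * is indexed and sized.  The BDF macros are restated here just for the
 * example; they mirror the bus/devfn packing the patch relies on, i.e. a
 * 16-bit index, so PCI_BDF(-1, -1, -1) + 1 = 65536 bits per segment.
 */
#include <stdio.h>

#define PCI_DEVFN(d, f)  ((((d) & 0x1f) << 3) | ((f) & 0x07))
#define PCI_BDF(b, d, f) ((((b) & 0xff) << 8) | PCI_DEVFN(d, f))
#define PCI_BDF2(b, df)  ((((b) & 0xff) << 8) | ((df) & 0xff))
#define BITS_TO_LONGS(n) (((n) + 8 * sizeof(long) - 1) / (8 * sizeof(long)))

int main(void)
{
    /* The AMD IOMMU from the quoted lspci output sits at 00:00.2. */
    unsigned int bdf = PCI_BDF(0x00, 0x00, 0x2);
    size_t sz = BITS_TO_LONGS(PCI_BDF(-1, -1, -1) + 1) * sizeof(long);

    printf("ro_map bit for 0000:00:00.2: %u\n", bdf);          /* 2 */
    printf("ro_map size per segment:     %zu bytes\n", sz);    /* 8192 */
    printf("PCI_BDF2(0x00, PCI_DEVFN(0, 2)): %u\n",
           PCI_BDF2(0x00, PCI_DEVFN(0, 2)));                   /* 2 */
    return 0;
}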