AMD IOMMU: add mechanism to protect their PCI devices' config spaces

Recent Dom0 kernels want to disable PCI MSI on all devices, yet doing so on AMD IOMMUs (which get represented by a PCI device) disables part of the functionality set up by the hypervisor.

Add a mechanism to mark certain PCI devices as having write protected config spaces (both through port based [method 1] accesses and, for x86-64, mmconfig), and use that for AMD's IOMMUs.

Note that due to ptwr_do_page_fault() being run first, there'll be a MEM_LOG() issued for each such mmconfig based write attempt. If that's undesirable, the order of the calls in fixup_page_fault() would need to be swapped.

Signed-off-by: Jan Beulich
Tested-by: Wei Wang
--- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -5209,6 +5209,97 @@ int ptwr_do_page_fault(struct vcpu *v, u return 0; } +#ifdef __x86_64__ +/************************* + * fault handling for read-only MMIO pages + */ + +struct mmio_ro_emulate_ctxt { + struct x86_emulate_ctxt ctxt; + unsigned long cr2; +}; + +static int mmio_ro_emulated_read( + enum x86_segment seg, + unsigned long offset, + void *p_data, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + return X86EMUL_UNHANDLEABLE; +} + +static int mmio_ro_emulated_write( + enum x86_segment seg, + unsigned long offset, + void *p_data, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct mmio_ro_emulate_ctxt *mmio_ro_ctxt = + container_of(ctxt, struct mmio_ro_emulate_ctxt, ctxt); + + /* Only allow naturally-aligned stores at the original %cr2 address. 
*/ + if ( ((bytes | offset) & (bytes - 1)) || offset != mmio_ro_ctxt->cr2 ) + { + MEM_LOG("mmio_ro_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)", + mmio_ro_ctxt->cr2, offset, bytes); + return X86EMUL_UNHANDLEABLE; + } + + return X86EMUL_OKAY; +} + +static const struct x86_emulate_ops mmio_ro_emulate_ops = { + .read = mmio_ro_emulated_read, + .insn_fetch = ptwr_emulated_read, + .write = mmio_ro_emulated_write, +}; + +/* Check if guest is trying to modify a r/o MMIO page. */ +int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr, + struct cpu_user_regs *regs) +{ + l1_pgentry_t pte; + unsigned long mfn; + unsigned int addr_size = is_pv_32on64_domain(v->domain) ? + 32 : BITS_PER_LONG; + struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { + .ctxt.regs = regs, + .ctxt.addr_size = addr_size, + .ctxt.sp_size = addr_size, + .cr2 = addr + }; + int rc; + + /* Attempt to read the PTE that maps the VA being accessed. */ + guest_get_eff_l1e(v, addr, &pte); + + /* We are looking only for read-only mappings of MMIO pages. */ + if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ) + return 0; + + mfn = l1e_get_pfn(pte); + if ( mfn_valid(mfn) ) + { + struct page_info *page = mfn_to_page(mfn); + struct domain *owner = page_get_owner_and_reference(page); + + if ( owner ) + put_page(page); + if ( owner != dom_io ) + return 0; + } + + if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) ) + return 0; + + rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops); + + return rc != X86EMUL_UNHANDLEABLE ? 
EXCRET_fault_fixed : 0; +} +#endif /* __x86_64__ */ + void free_xen_pagetable(void *v) { if ( system_state == SYS_STATE_early_boot ) --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -1349,20 +1349,23 @@ static int fixup_page_fault(unsigned lon return 0; } - if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && - guest_kernel_mode(v, regs) ) - { - unsigned int mbs = PFEC_write_access; - unsigned int mbz = PFEC_reserved_bit | PFEC_insn_fetch; - - /* Do not check if access-protection fault since the page may - legitimately be not present in shadow page tables */ - if ( !paging_mode_enabled(d) ) - mbs |= PFEC_page_present; - - if ( ((regs->error_code & (mbs | mbz)) == mbs) && + if ( guest_kernel_mode(v, regs) && + !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) && + (regs->error_code & PFEC_write_access) ) + { + if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && + /* Do not check if access-protection fault since the page may + legitimately be not present in shadow page tables */ + (paging_mode_enabled(d) || + (regs->error_code & PFEC_page_present)) && ptwr_do_page_fault(v, addr, regs) ) return EXCRET_fault_fixed; + +#ifdef __x86_64__ + if ( IS_PRIV(d) && (regs->error_code & PFEC_page_present) && + mmio_ro_do_page_fault(v, addr, regs) ) + return EXCRET_fault_fixed; +#endif } /* For non-external shadowed guests, we fix up both their own @@ -1690,6 +1693,13 @@ static int pci_cfg_ok(struct domain *d, return 0; machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF; + if ( write ) + { + const unsigned long *ro_map = pci_get_ro_map(0); + + if ( ro_map && test_bit(machine_bdf, ro_map) ) + return 0; + } start = d->arch.pci_cf8 & 0xFF; end = start + size - 1; if (xsm_pci_config_permission(d, machine_bdf, start, end, write)) --- a/xen/arch/x86/x86_32/pci.c +++ b/xen/arch/x86/x86_32/pci.c @@ -6,6 +6,7 @@ #include #include +#include #include #define PCI_CONF_ADDRESS(bus, dev, func, reg) \ @@ -70,3 +71,7 @@ void pci_conf_write32( BUG_ON((bus > 255) || (dev > 31) || 
(func > 7) || (reg > 255)); pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data); } + +void __init arch_pci_ro_device(int seg, int bdf) +{ +} --- a/xen/arch/x86/x86_64/mmconfig_64.c +++ b/xen/arch/x86/x86_64/mmconfig_64.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "mmconfig.h" @@ -132,9 +134,30 @@ static void __iomem *mcfg_ioremap(const return (void __iomem *) virt; } +void arch_pci_ro_device(int seg, int bdf) +{ + unsigned int idx, bus = PCI_BUS(bdf); + + for (idx = 0; idx < pci_mmcfg_config_num; ++idx) { + const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg; + unsigned long mfn = (cfg->address >> PAGE_SHIFT) + bdf; + + if (!pci_mmcfg_virt[idx].virt || cfg->pci_segment != seg || + cfg->start_bus_number > bus || cfg->end_bus_number < bus) + continue; + + if (rangeset_add_singleton(mmio_ro_ranges, mfn)) + printk(XENLOG_ERR + "%04x:%02x:%02x.%u: could not mark MCFG (mfn %#lx) read-only\n", + cfg->pci_segment, bus, PCI_SLOT(bdf), PCI_FUNC(bdf), + mfn); + } +} + int pci_mmcfg_arch_enable(unsigned int idx) { const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg; + const unsigned long *ro_map = pci_get_ro_map(cfg->pci_segment); if (pci_mmcfg_virt[idx].virt) return 0; @@ -146,6 +169,16 @@ int pci_mmcfg_arch_enable(unsigned int i } printk(KERN_INFO "PCI: Using MCFG for segment %04x bus %02x-%02x\n", cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number); + if (ro_map) { + unsigned int bdf = PCI_BDF(cfg->start_bus_number, 0, 0); + unsigned int end = PCI_BDF(cfg->end_bus_number, -1, -1); + + while ((bdf = find_next_bit(ro_map, end + 1, bdf)) <= end) { + arch_pci_ro_device(cfg->pci_segment, bdf); + if (bdf++ == end) + break; + } + } return 0; } --- a/xen/drivers/passthrough/amd/iommu_detect.c +++ b/xen/drivers/passthrough/amd/iommu_detect.c @@ -153,6 +153,12 @@ int __init amd_iommu_detect_one_acpi( if ( rt ) return -ENODEV; + rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func)); + if ( rt ) + 
printk(XENLOG_ERR + "Could not mark config space of %04x:%02x:%02x.%u read-only (%d)\n", + iommu->seg, bus, dev, func, rt); + list_add_tail(&iommu->list, &amd_iommu_head); return 0; --- a/xen/drivers/passthrough/io.c +++ b/xen/drivers/passthrough/io.c @@ -593,11 +593,3 @@ void hvm_dpci_eoi(struct domain *d, unsi unlock: spin_unlock(&d->event_lock); } - -static int __init setup_mmio_ro_ranges(void) -{ - mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges", - RANGESETF_prettyprint_hex); - return 0; -} -__initcall(setup_mmio_ro_ranges); --- a/xen/drivers/passthrough/pci.c +++ b/xen/drivers/passthrough/pci.c @@ -36,6 +36,7 @@ struct pci_seg { struct list_head alldevs_list; u16 nr; + unsigned long *ro_map; /* bus2bridge_lock protects bus2bridge array */ spinlock_t bus2bridge_lock; #define MAX_BUSES 256 @@ -106,6 +107,8 @@ void __init pt_pci_init(void) radix_tree_init(&pci_segments); if ( !alloc_pseg(0) ) panic("Could not initialize PCI segment 0\n"); + mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges", + RANGESETF_prettyprint_hex); } int __init pci_add_segment(u16 seg) @@ -113,6 +116,13 @@ int __init pci_add_segment(u16 seg) return alloc_pseg(seg) ? 0 : -ENOMEM; } +const unsigned long *pci_get_ro_map(u16 seg) +{ + struct pci_seg *pseg = get_pseg(seg); + + return pseg ? 
pseg->ro_map : NULL; +} + static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn) { struct pci_dev *pdev; @@ -198,6 +208,33 @@ static void free_pdev(struct pci_seg *ps xfree(pdev); } +int __init pci_ro_device(int seg, int bus, int devfn) +{ + struct pci_seg *pseg = alloc_pseg(seg); + struct pci_dev *pdev; + + if ( !pseg ) + return -ENOMEM; + pdev = alloc_pdev(pseg, bus, devfn); + if ( !pdev ) + return -ENOMEM; + + if ( !pseg->ro_map ) + { + size_t sz = BITS_TO_LONGS(PCI_BDF(-1, -1, -1) + 1) * sizeof(long); + + pseg->ro_map = alloc_xenheap_pages(get_order_from_bytes(sz), 0); + if ( !pseg->ro_map ) + return -ENOMEM; + memset(pseg->ro_map, 0, sz); + } + + __set_bit(PCI_BDF2(bus, devfn), pseg->ro_map); + arch_pci_ro_device(seg, PCI_BDF2(bus, devfn)); + + return 0; +} + struct pci_dev *pci_get_pdev(int seg, int bus, int devfn) { struct pci_seg *pseg = get_pseg(seg); --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -555,6 +555,8 @@ void memguard_unguard_stack(void *p); int ptwr_do_page_fault(struct vcpu *, unsigned long, struct cpu_user_regs *); +int mmio_ro_do_page_fault(struct vcpu *, unsigned long, + struct cpu_user_regs *); int audit_adjust_pgtables(struct domain *d, int dir, int noisy); --- a/xen/include/xen/pci.h +++ b/xen/include/xen/pci.h @@ -98,8 +98,11 @@ struct pci_dev *pci_lock_domain_pdev( void setup_dom0_pci_devices(struct domain *, void (*)(struct pci_dev *)); void pci_release_devices(struct domain *d); int pci_add_segment(u16 seg); +const unsigned long *pci_get_ro_map(u16 seg); int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *); int pci_remove_device(u16 seg, u8 bus, u8 devfn); +int pci_ro_device(int seg, int bus, int devfn); +void arch_pci_ro_device(int seg, int bdf); struct pci_dev *pci_get_pdev(int seg, int bus, int devfn); struct pci_dev *pci_get_pdev_by_domain( struct domain *, int seg, int bus, int devfn);