
Re: [Xen-devel] [PATCH V2] amd iommu: re-enable iommu msi if dom0 disabled it



>>> On 21.06.12 at 17:29, Wei Wang <wei.wang2@xxxxxxx> wrote:
> On 06/20/2012 05:45 PM, Jan Beulich wrote:
>> Rather than submitting it for inclusion immediately, I'd rather see
>> you first use it successfully. Below/attached what appears to
>> work fine for me in contrived situations (i.e. hiding bridges with
>> nothing connected, as I still don't have any AMD IOMMU system
>> at hand).
> 
> The patch works for me. The IOMMU MSI capability is shown as enabled 
> with this patch.

Keir,

the question now arises whether we really want this, and
particularly this late before 4.2. The Linux folks don't seem
willing to take the straightforward workaround for the problem
introduced at their end, so we will likely need something along
these lines before 4.2 goes out (all the more so since the
questionable fix has already made it into various stable trees).
Even the older trees would be affected, it's just that putting a
change like this there is even more questionable.

There are obviously more potential problems in this area: If
any of the MMIO addresses used by AMD's IOMMU is
configurable through one of the BARs, and if Dom0 decides to
re-assign MMIO space, neither allowing the writes nor simply
dropping them as done here will work. Whether that's a real
problem I don't know - Wei? And there might be other issues
arising from dropping all writes - we might just currently not be
aware of them.
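
To illustrate the concern (a rough, purely hypothetical sketch, not
meant for inclusion): if the MMIO base did live in an ordinary type-0
header BAR, simply dropping writes would not be enough - the write
filter would also have to spot config space writes touching the BARs
(offsets 0x10..0x27) and refresh mmio_ro_ranges accordingly, along
the lines of:

    /* Hypothetical helper; offsets follow the standard PCI type-0 header. */
    #define PCI_BASE_ADDRESS_0  0x10
    #define PCI_BASE_ADDRESS_5  0x24

    /* Does a config space write of 'size' bytes at 'reg' overlap any BAR? */
    static int cfg_write_hits_bar(unsigned int reg, unsigned int size)
    {
        unsigned int last = reg + size - 1;

        return last >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5 + 3;
    }

Nothing of the sort is attempted in the patch below; whether anything
like it would ever be needed is exactly the open question above.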

Jan

> 00:00.2 Generic system peripheral [0806]: Advanced Micro Devices [AMD] Device 1419
>          Subsystem: Advanced Micro Devices [AMD] Device 1419
>          Flags: bus master, fast devsel, latency 0, IRQ 11
>          Capabilities: [40] Secure device <?>
>          Capabilities: [54] MSI: Enable+ Count=1/1 Maskable- 64bit+
>          Capabilities: [64] HyperTransport: MSI Mapping Enable+ Fixed+
> 
> 
>> Jan
>>
>> Note that due to ptwr_do_page_fault() being run first, there'll be a
>> MEM_LOG() issued for each such attempt. If that's undesirable, the
>> order of the calls would need to be swapped.
>>
>> --- a/xen/arch/x86/mm.c
>> +++ b/xen/arch/x86/mm.c
>> @@ -1875,6 +1875,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
>>               break;
>>           case 1:
>>               l1e_remove_flags(nl1e, _PAGE_RW);
>> +            rc = 0;
>>               break;
>>           }
>>           if ( page )
>> @@ -5208,6 +5209,97 @@ int ptwr_do_page_fault(struct vcpu *v, u
>>       return 0;
>>   }
>>
>> +#ifdef __x86_64__
>> +/*************************
>> + * fault handling for read-only MMIO pages
>> + */
>> +
>> +struct mmio_ro_emulate_ctxt {
>> +    struct x86_emulate_ctxt ctxt;
>> +    unsigned long cr2;
>> +};
>> +
>> +static int mmio_ro_emulated_read(
>> +    enum x86_segment seg,
>> +    unsigned long offset,
>> +    void *p_data,
>> +    unsigned int bytes,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    return X86EMUL_UNHANDLEABLE;
>> +}
>> +
>> +static int mmio_ro_emulated_write(
>> +    enum x86_segment seg,
>> +    unsigned long offset,
>> +    void *p_data,
>> +    unsigned int bytes,
>> +    struct x86_emulate_ctxt *ctxt)
>> +{
>> +    struct mmio_ro_emulate_ctxt *mmio_ro_ctxt =
>> +        container_of(ctxt, struct mmio_ro_emulate_ctxt, ctxt);
>> +
>> +    /* Only allow naturally-aligned stores at the original %cr2 address. */
>> +    if ( ((bytes | offset) & (bytes - 1)) || offset != mmio_ro_ctxt->cr2 )
>> +    {
>> +        MEM_LOG("mmio_ro_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
>> +                mmio_ro_ctxt->cr2, offset, bytes);
>> +        return X86EMUL_UNHANDLEABLE;
>> +    }
>> +
>> +    return X86EMUL_OKAY;
>> +}
>> +
>> +static const struct x86_emulate_ops mmio_ro_emulate_ops = {
>> +    .read       = mmio_ro_emulated_read,
>> +    .insn_fetch = ptwr_emulated_read,
>> +    .write      = mmio_ro_emulated_write,
>> +};
>> +
>> +/* Check if guest is trying to modify a r/o MMIO page. */
>> +int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr,
>> +                          struct cpu_user_regs *regs)
>> +{
>> +    l1_pgentry_t      pte;
>> +    unsigned long     mfn;
>> +    unsigned int      addr_size = is_pv_32on64_domain(v->domain) ?
>> +                                  32 : BITS_PER_LONG;
>> +    struct mmio_ro_emulate_ctxt mmio_ro_ctxt = {
>> +        .ctxt.regs = regs,
>> +        .ctxt.addr_size = addr_size,
>> +        .ctxt.sp_size = addr_size,
>> +        .cr2 = addr
>> +    };
>> +    int rc;
>> +
>> +    /* Attempt to read the PTE that maps the VA being accessed. */
>> +    guest_get_eff_l1e(v, addr, &pte);
>> +
>> +    /* We are looking only for read-only mappings of MMIO pages. */
>> +    if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) )
>> +        return 0;
>> +
>> +    mfn = l1e_get_pfn(pte);
>> +    if ( mfn_valid(mfn) )
>> +    {
>> +        struct page_info *page = mfn_to_page(mfn);
>> +        struct domain *owner = page_get_owner_and_reference(page);
>> +
>> +        if ( owner )
>> +            put_page(page);
>> +        if ( owner != dom_io )
>> +            return 0;
>> +    }
>> +
>> +    if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
>> +        return 0;
>> +
>> +    rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops);
>> +
>> +    return rc != X86EMUL_UNHANDLEABLE ? EXCRET_fault_fixed : 0;
>> +}
>> +#endif /* __x86_64__ */
>> +
>>   void free_xen_pagetable(void *v)
>>   {
>>       if ( system_state == SYS_STATE_early_boot )
>> --- a/xen/arch/x86/traps.c
>> +++ b/xen/arch/x86/traps.c
>> @@ -1349,20 +1349,23 @@ static int fixup_page_fault(unsigned lon
>>           return 0;
>>       }
>>
>> -    if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
>> -         guest_kernel_mode(v, regs) )
>> -    {
>> -        unsigned int mbs = PFEC_write_access;
>> -        unsigned int mbz = PFEC_reserved_bit | PFEC_insn_fetch;
>> -
>> -        /* Do not check if access-protection fault since the page may
>> -           legitimately be not present in shadow page tables */
>> -        if ( !paging_mode_enabled(d) )
>> -            mbs |= PFEC_page_present;
>> -
>> -        if ( ((regs->error_code & (mbs | mbz)) == mbs) &&
>> +    if ( guest_kernel_mode(v, regs) &&
>> +         !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
>> +         (regs->error_code & PFEC_write_access) )
>> +    {
>> +        if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
>> +             /* Do not check if access-protection fault since the page may
>> +                legitimately be not present in shadow page tables */
>> +             (paging_mode_enabled(d) ||
>> +              (regs->error_code & PFEC_page_present)) &&
>>                ptwr_do_page_fault(v, addr, regs) )
>>               return EXCRET_fault_fixed;
>> +
>> +#ifdef __x86_64__
>> +        if ( IS_PRIV(d) && (regs->error_code & PFEC_page_present) &&
>> +             mmio_ro_do_page_fault(v, addr, regs) )
>> +            return EXCRET_fault_fixed;
>> +#endif
>>       }
>>
>>       /* For non-external shadowed guests, we fix up both their own
>> @@ -1690,6 +1693,13 @@ static int pci_cfg_ok(struct domain *d,
>>           return 0;
>>
>>       machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF;
>> +    if ( write )
>> +    {
>> +        const unsigned long *ro_map = pci_get_ro_map(0);
>> +
>> +        if ( ro_map && test_bit(machine_bdf, ro_map) )
>> +            return 0;
>> +    }
>>       start = d->arch.pci_cf8 & 0xFF;
>>       end = start + size - 1;
>>       if (xsm_pci_config_permission(d, machine_bdf, start, end, write))
>> --- a/xen/arch/x86/x86_32/pci.c
>> +++ b/xen/arch/x86/x86_32/pci.c
>> @@ -6,6 +6,7 @@
>>
>>   #include <xen/spinlock.h>
>>   #include <xen/pci.h>
>> +#include <xen/init.h>
>>   #include <asm/io.h>
>>
>>   #define PCI_CONF_ADDRESS(bus, dev, func, reg) \
>> @@ -70,3 +71,7 @@ void pci_conf_write32(
>>       BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
>>       pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data);
>>   }
>> +
>> +void __init arch_pci_ro_device(int seg, int bdf)
>> +{
>> +}
>> --- a/xen/arch/x86/x86_64/mmconfig_64.c
>> +++ b/xen/arch/x86/x86_64/mmconfig_64.c
>> @@ -145,9 +145,30 @@ static void __iomem *mcfg_ioremap(const
>>       return (void __iomem *) virt;
>>   }
>>
>> +void arch_pci_ro_device(int seg, int bdf)
>> +{
>> +    unsigned int idx, bus = PCI_BUS(bdf);
>> +
>> +    for (idx = 0; idx < pci_mmcfg_config_num; ++idx) {
>> +        const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg;
>> +        unsigned long mfn = (cfg->address >> PAGE_SHIFT) + bdf;
>> +
>> +        if (!pci_mmcfg_virt[idx].virt || cfg->pci_segment != seg ||
>> +            cfg->start_bus_number > bus || cfg->end_bus_number < bus)
>> +            continue;
>> +
>> +        if (rangeset_add_singleton(mmio_ro_ranges, mfn))
>> +            printk(XENLOG_ERR
>> +                   "%04x:%02x:%02x.%u: could not mark MCFG (mfn %#lx) 
> read-only\n",
>> +                   cfg->pci_segment, bus, PCI_SLOT(bdf), PCI_FUNC(bdf),
>> +                   mfn);
>> +    }
>> +}
>> +
>>   int pci_mmcfg_arch_enable(unsigned int idx)
>>   {
>>       const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
>> +    const unsigned long *ro_map = pci_get_ro_map(cfg->pci_segment);
>>
>>       if (pci_mmcfg_virt[idx].virt)
>>           return 0;
>> @@ -159,6 +180,16 @@ int pci_mmcfg_arch_enable(unsigned int i
>>       }
>>       printk(KERN_INFO "PCI: Using MCFG for segment %04x bus %02x-%02x\n",
>>              cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
>> +    if (ro_map) {
>> +        unsigned int bdf = PCI_BDF(cfg->start_bus_number, 0, 0);
>> +        unsigned int end = PCI_BDF(cfg->end_bus_number, -1, -1);
>> +
>> +        while ((bdf = find_next_bit(ro_map, end + 1, bdf)) <= end) {
>> +            arch_pci_ro_device(cfg->pci_segment, bdf);
>> +            if (bdf++ == end)
>> +                break;
>> +        }
>> +    }
>>       return 0;
>>   }
>>
>> --- a/xen/drivers/passthrough/amd/iommu_detect.c
>> +++ b/xen/drivers/passthrough/amd/iommu_detect.c
>> @@ -153,6 +153,12 @@ int __init amd_iommu_detect_one_acpi(
>>       if ( rt )
>>           return -ENODEV;
>>
>> +    rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func));
>> +    if ( rt )
>> +        printk(XENLOG_ERR
>> +               "Could not mark config space of %04x:%02x:%02x.%u read-only 
> (%d)\n",
>> +               iommu->seg, bus, dev, func, rt);
>> +
>>       list_add_tail(&iommu->list, &amd_iommu_head);
>>
>>       return 0;
>> --- a/xen/drivers/passthrough/io.c
>> +++ b/xen/drivers/passthrough/io.c
>> @@ -593,11 +593,3 @@ void hvm_dpci_eoi(struct domain *d, unsi
>>   unlock:
>>       spin_unlock(&d->event_lock);
>>   }
>> -
>> -static int __init setup_mmio_ro_ranges(void)
>> -{
>> -    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
>> -                                  RANGESETF_prettyprint_hex);
>> -    return 0;
>> -}
>> -__initcall(setup_mmio_ro_ranges);
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -36,6 +36,7 @@
>>   struct pci_seg {
>>       struct list_head alldevs_list;
>>       u16 nr;
>> +    unsigned long *ro_map;
>>       /* bus2bridge_lock protects bus2bridge array */
>>       spinlock_t bus2bridge_lock;
>>   #define MAX_BUSES 256
>> @@ -106,6 +107,8 @@ void __init pt_pci_init(void)
>>       radix_tree_init(&pci_segments);
>>       if ( !alloc_pseg(0) )
>>           panic("Could not initialize PCI segment 0\n");
>> +    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
>> +                                  RANGESETF_prettyprint_hex);
>>   }
>>
>>   int __init pci_add_segment(u16 seg)
>> @@ -113,6 +116,13 @@ int __init pci_add_segment(u16 seg)
>>       return alloc_pseg(seg) ? 0 : -ENOMEM;
>>   }
>>
>> +const unsigned long *pci_get_ro_map(u16 seg)
>> +{
>> +    struct pci_seg *pseg = get_pseg(seg);
>> +
>> +    return pseg ? pseg->ro_map : NULL;
>> +}
>> +
>>   static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
>>   {
>>       struct pci_dev *pdev;
>> @@ -198,6 +208,33 @@ static void free_pdev(struct pci_seg *ps
>>       xfree(pdev);
>>   }
>>
>> +int __init pci_ro_device(int seg, int bus, int devfn)
>> +{
>> +    struct pci_seg *pseg = alloc_pseg(seg);
>> +    struct pci_dev *pdev;
>> +
>> +    if ( !pseg )
>> +        return -ENOMEM;
>> +    pdev = alloc_pdev(pseg, bus, devfn);
>> +    if ( !pdev )
>> +        return -ENOMEM;
>> +
>> +    if ( !pseg->ro_map )
>> +    {
>> +        size_t sz = BITS_TO_LONGS(PCI_BDF(-1, -1, -1) + 1) * sizeof(long);
>> +
>> +        pseg->ro_map = alloc_xenheap_pages(get_order_from_bytes(sz), 0);
>> +        if ( !pseg->ro_map )
>> +            return -ENOMEM;
>> +        memset(pseg->ro_map, 0, sz);
>> +    }
>> +
>> +    __set_bit(PCI_BDF2(bus, devfn), pseg->ro_map);
>> +    arch_pci_ro_device(seg, PCI_BDF2(bus, devfn));
>> +
>> +    return 0;
>> +}
>> +
>>   struct pci_dev *pci_get_pdev(int seg, int bus, int devfn)
>>   {
>>       struct pci_seg *pseg = get_pseg(seg);
>> --- a/xen/include/asm-x86/mm.h
>> +++ b/xen/include/asm-x86/mm.h
>> @@ -555,6 +555,8 @@ void memguard_unguard_stack(void *p);
>>
>>   int  ptwr_do_page_fault(struct vcpu *, unsigned long,
>>                           struct cpu_user_regs *);
>> +int  mmio_ro_do_page_fault(struct vcpu *, unsigned long,
>> +                           struct cpu_user_regs *);
>>
>>   int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
>>
>> --- a/xen/include/xen/pci.h
>> +++ b/xen/include/xen/pci.h
>> @@ -98,8 +98,11 @@ struct pci_dev *pci_lock_domain_pdev(
>>   void setup_dom0_pci_devices(struct domain *, void (*)(struct pci_dev *));
>>   void pci_release_devices(struct domain *d);
>>   int pci_add_segment(u16 seg);
>> +const unsigned long *pci_get_ro_map(u16 seg);
>>   int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *);
>>   int pci_remove_device(u16 seg, u8 bus, u8 devfn);
>> +int pci_ro_device(int seg, int bus, int devfn);
>> +void arch_pci_ro_device(int seg, int bdf);
>>   struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
>>   struct pci_dev *pci_get_pdev_by_domain(
>>       struct domain *, int seg, int bus, int devfn);
>>



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

