x86: enable multi-vector MSI This implies - extending the public interface to have a way to request a block of MSIs - allocating a block of contiguous pIRQ-s for the target domain (but note that the Xen IRQs allocated have no need of being contiguous) - repeating certain operations for all involved IRQs - fixing multi_msi_enable() - adjusting the mask bit accesses for maskable MSIs Signed-off-by: Jan Beulich --- a/xen/arch/x86/irq.c +++ b/xen/arch/x86/irq.c @@ -1880,6 +1880,25 @@ int get_free_pirq(struct domain *d, int return -ENOSPC; } +int get_free_pirqs(struct domain *d, unsigned int nr) +{ + unsigned int i, found = 0; + + ASSERT(spin_is_locked(&d->event_lock)); + + for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i ) + if ( is_free_pirq(d, pirq_info(d, i)) ) + { + pirq_get_info(d, i); + if ( ++found == nr ) + return i; + } + else + found = 0; + + return -ENOSPC; +} + int map_domain_pirq( struct domain *d, int pirq, int irq, int type, void *data) { @@ -1935,11 +1954,12 @@ int map_domain_pirq( desc = irq_to_desc(irq); - if ( type == MAP_PIRQ_TYPE_MSI ) + if ( type == MAP_PIRQ_TYPE_MSI || type == MAP_PIRQ_TYPE_MULTI_MSI ) { struct msi_info *msi = (struct msi_info *)data; struct msi_desc *msi_desc; struct pci_dev *pdev; + unsigned int nr = 0; ASSERT(spin_is_locked(&pcidevs_lock)); @@ -1950,7 +1970,14 @@ int map_domain_pirq( pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn); ret = pci_enable_msi(msi, &msi_desc); if ( ret ) + { + if ( ret > 0 ) + { + msi->entry_nr = ret; + ret = -ENFILE; + } goto done; + } spin_lock_irqsave(&desc->lock, flags); @@ -1964,25 +1991,73 @@ int map_domain_pirq( goto done; } - ret = setup_msi_irq(desc, msi_desc); - if ( ret ) + while ( !(ret = setup_msi_irq(desc, msi_desc + nr)) ) { + if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV && + !desc->arch.used_vectors ) + { + desc->arch.used_vectors = &pdev->arch.used_vectors; + if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED ) + { + int vector = desc->arch.vector; + + ASSERT(!test_bit(vector, desc->arch.used_vectors)); + set_bit(vector, desc->arch.used_vectors); + } + } + if ( type == MAP_PIRQ_TYPE_MSI || + msi_desc->msi_attrib.type != PCI_CAP_ID_MSI || + ++nr == msi->entry_nr ) + break; + + set_domain_irq_pirq(d, irq, info); spin_unlock_irqrestore(&desc->lock, flags); - pci_disable_msi(msi_desc); - goto done; + + info = NULL; + irq = create_irq(NUMA_NO_NODE); + ret = irq >= 0 ? prepare_domain_irq_pirq(d, irq, pirq + nr, &info) + : irq; + if ( ret ) + break; + msi_desc[nr].irq = irq; + + if ( irq_permit_access(d, irq) != 0 ) + printk(XENLOG_G_WARNING + "dom%d: could not permit access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq); + + desc = irq_to_desc(irq); + spin_lock_irqsave(&desc->lock, flags); + + if ( desc->handler != &no_irq_type ) + { + dprintk(XENLOG_G_ERR, "dom%d: irq %d (pirq %u) in use (%s)\n", + d->domain_id, irq, pirq + nr, desc->handler->typename); + ret = -EBUSY; + break; + } } - if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV - && !desc->arch.used_vectors ) + if ( ret ) { - desc->arch.used_vectors = &pdev->arch.used_vectors; - if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED ) + spin_unlock_irqrestore(&desc->lock, flags); + while ( nr-- ) { - int vector = desc->arch.vector; - ASSERT(!test_bit(vector, desc->arch.used_vectors)); - - set_bit(vector, desc->arch.used_vectors); + if ( irq >= 0 ) + { + if ( irq_deny_access(d, irq) ) + printk(XENLOG_G_ERR + "dom%d: could not revoke access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq); + destroy_irq(irq); + } + if ( info ) + cleanup_domain_irq_pirq(d, irq, info); + info = pirq_info(d, pirq + nr); + irq = info->arch.irq; } + pci_disable_msi(msi_desc); + goto done; } set_domain_irq_pirq(d, irq, info); @@ -2013,7 +2088,8 @@ int unmap_domain_pirq(struct domain *d, { unsigned long flags; struct irq_desc *desc; - int irq, ret = 0; + int irq, ret = 0, rc; + unsigned int i, nr = 1; bool_t forced_unbind; struct pirq *info; struct msi_desc *msi_desc = NULL; @@ -2035,6 +2111,18 @@ int unmap_domain_pirq(struct domain *d, desc = irq_to_desc(irq); msi_desc = desc->msi_desc; + if ( msi_desc && msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) + { + if ( msi_desc->msi_attrib.entry_nr ) + { + printk(XENLOG_G_ERR + "dom%d: trying to unmap secondary MSI pirq %d\n", + d->domain_id, pirq); + ret = -EBUSY; + goto done; + } + nr = msi_desc->msi.nvec; + } ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc); if ( ret ) @@ -2050,37 +2138,83 @@ int unmap_domain_pirq(struct domain *d, spin_lock_irqsave(&desc->lock, flags); - BUG_ON(irq != domain_pirq_to_irq(d, pirq)); - - if ( !forced_unbind ) - clear_domain_irq_pirq(d, irq, info); - else + for ( i = 0; ; ) { - info->arch.irq = -irq; - radix_tree_replace_slot( - radix_tree_lookup_slot(&d->arch.irq_pirq, irq), - radix_tree_int_to_ptr(-pirq)); + BUG_ON(irq != domain_pirq_to_irq(d, pirq + i)); + + if ( !forced_unbind ) + clear_domain_irq_pirq(d, irq, info); + else + { + info->arch.irq = -irq; + radix_tree_replace_slot( + radix_tree_lookup_slot(&d->arch.irq_pirq, irq), + radix_tree_int_to_ptr(-pirq)); + } + + if ( msi_desc ) + { + desc->handler = &no_irq_type; + desc->msi_desc = NULL; + } + + if ( ++i == nr ) + break; + + spin_unlock_irqrestore(&desc->lock, flags); + + if ( !forced_unbind ) + cleanup_domain_irq_pirq(d, irq, info); + + rc = irq_deny_access(d, irq); + if ( rc ) + { + printk(XENLOG_G_ERR + "dom%d: could not deny access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq + i); + ret = rc; + } + + do { + info = pirq_info(d, pirq + i); + if ( info && (irq = info->arch.irq) > 0 ) + break; + printk(XENLOG_G_ERR "dom%d: MSI pirq %d not mapped\n", + d->domain_id, pirq + i); + } while ( ++i < nr ); + + if ( i == nr ) + { + desc = NULL; + break; + } + + desc = irq_to_desc(irq); + BUG_ON(desc->msi_desc != msi_desc + i); + + spin_lock_irqsave(&desc->lock, flags); } - if ( msi_desc ) + if ( desc ) { - desc->handler = &no_irq_type; - desc->msi_desc = NULL; + spin_unlock_irqrestore(&desc->lock, flags); + + if ( !forced_unbind ) + cleanup_domain_irq_pirq(d, irq, info); + + rc = irq_deny_access(d, irq); + if ( rc ) + { + printk(XENLOG_G_ERR + "dom%d: could not deny access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq + nr - 1); + ret = rc; + } } - spin_unlock_irqrestore(&desc->lock, flags); if (msi_desc) msi_free_irq(msi_desc); - if ( !forced_unbind ) - cleanup_domain_irq_pirq(d, irq, info); - - ret = irq_deny_access(d, irq); - if ( ret ) - printk(XENLOG_G_ERR - "dom%d: could not deny access to IRQ%d (pirq %d)\n", - d->domain_id, irq, pirq); - done: return ret; } --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -238,6 +238,11 @@ static int write_msi_msg(struct msi_desc u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); + int nr = entry->msi_attrib.entry_nr; + + ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr); + if ( nr ) + return 0; pci_conf_write32(seg, bus, slot, func, msi_lower_address_reg(pos), msg->address_lo); @@ -361,8 +366,8 @@ static void msi_set_mask_bit(struct irq_ u8 func = PCI_FUNC(entry->dev->devfn); mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos); - mask_bits &= ~(1); - mask_bits |= flag; + mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr); + mask_bits |= (u32)flag << entry->msi_attrib.entry_nr; pci_conf_write32(seg, bus, slot, func, entry->msi.mpos, mask_bits); } break; @@ -386,10 +391,11 @@ static int msi_get_mask_bit(const struct case PCI_CAP_ID_MSI: if (!entry->dev || !entry->msi_attrib.maskbit) break; - return pci_conf_read32(entry->dev->seg, entry->dev->bus, - PCI_SLOT(entry->dev->devfn), - PCI_FUNC(entry->dev->devfn), - entry->msi.mpos) & 1; + return (pci_conf_read32(entry->dev->seg, entry->dev->bus, + PCI_SLOT(entry->dev->devfn), + PCI_FUNC(entry->dev->devfn), + entry->msi.mpos) >> + entry->msi_attrib.entry_nr) & 1; case PCI_CAP_ID_MSIX: return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1; } @@ -455,17 +461,20 @@ static hw_irq_controller pci_msi_nonmask .set_affinity = set_msi_affinity }; -static struct msi_desc* alloc_msi_entry(void) +static struct msi_desc *alloc_msi_entry(unsigned int nr) { struct msi_desc *entry; - entry = xmalloc(struct msi_desc); + entry = xmalloc_array(struct msi_desc, nr); if ( !entry ) return NULL; INIT_LIST_HEAD(&entry->list); - entry->dev = NULL; - entry->remap_index = -1; + while ( nr-- ) + { + entry[nr].dev = NULL; + entry[nr].remap_index = -1; + } return entry; } @@ -483,17 +492,24 @@ int setup_msi_irq(struct irq_desc *desc, int msi_free_irq(struct msi_desc *entry) { - destroy_irq(entry->irq); + unsigned int nr = entry->msi.nvec; + if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) { unsigned long start; start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1); msix_put_fixmap(entry->dev, virt_to_fix(start)); + nr = 1; } - /* Free the unused IRTE if intr remap enabled */ - if ( iommu_intremap ) - iommu_update_ire_from_msi(entry, NULL); + while ( nr-- ) + { + destroy_irq(entry[nr].irq); + + /* Free the unused IRTE if intr remap enabled */ + if ( iommu_intremap ) + iommu_update_ire_from_msi(entry + nr, NULL); + } list_del(&entry->list); xfree(entry); @@ -526,11 +542,12 @@ static struct msi_desc *find_msi_entry(s **/ static int msi_capability_init(struct pci_dev *dev, int irq, - struct msi_desc **desc) + struct msi_desc **desc, + unsigned int nvec) { struct msi_desc *entry; int pos; - unsigned int maxvec, mpos; + unsigned int i, maxvec, mpos; u16 control, seg = dev->seg; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); @@ -540,27 +557,34 @@ static int msi_capability_init(struct pc pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI); control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos)); maxvec = multi_msi_capable(control); + if ( nvec > maxvec ) + return maxvec; control &= ~PCI_MSI_FLAGS_QSIZE; + multi_msi_enable(control, nvec); /* MSI Entry Initialization */ msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ - entry = alloc_msi_entry(); + entry = alloc_msi_entry(nvec); if ( !entry ) return -ENOMEM; - entry->msi_attrib.type = PCI_CAP_ID_MSI; - entry->msi_attrib.is_64 = is_64bit_address(control); - entry->msi_attrib.entry_nr = 0; - entry->msi_attrib.maskbit = is_mask_bit_support(control); - entry->msi_attrib.masked = 1; - entry->msi_attrib.pos = pos; mpos = msi_mask_bits_reg(pos, is_64bit_address(control)); - entry->msi.nvec = 1; + for ( i = 0; i < nvec; ++i ) + { + entry[i].msi_attrib.type = PCI_CAP_ID_MSI; + entry[i].msi_attrib.is_64 = is_64bit_address(control); + entry[i].msi_attrib.entry_nr = i; + entry[i].msi_attrib.maskbit = is_mask_bit_support(control); + entry[i].msi_attrib.masked = 1; + entry[i].msi_attrib.pos = pos; + if ( entry[i].msi_attrib.maskbit ) + entry[i].msi.mpos = mpos; + entry[i].msi.nvec = 0; + entry[i].dev = dev; + } + entry->msi.nvec = nvec; entry->irq = irq; - if ( is_mask_bit_support(control) ) - entry->msi.mpos = mpos; - entry->dev = dev; if ( entry->msi_attrib.maskbit ) { u32 maskbits; @@ -688,7 +712,7 @@ static int msix_capability_init(struct p if ( desc ) { - entry = alloc_msi_entry(); + entry = alloc_msi_entry(1); if ( !entry ) return -ENOMEM; ASSERT(msi); @@ -846,7 +870,6 @@ static int msix_capability_init(struct p static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc) { - int status; struct pci_dev *pdev; struct msi_desc *old_desc; @@ -875,8 +898,7 @@ static int __pci_enable_msi(struct msi_i pci_disable_msi(old_desc); } - status = msi_capability_init(pdev, msi->irq, desc); - return status; + return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr); } static void __pci_disable_msi(struct msi_desc *entry) @@ -1096,6 +1118,8 @@ int pci_restore_msi_state(struct pci_dev list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list ) { + unsigned int i = 0, nr = 1; + irq = entry->irq; desc = &irq_desc[irq]; @@ -1105,30 +1129,58 @@ int pci_restore_msi_state(struct pci_dev if (desc->msi_desc != entry) { + bogus: dprintk(XENLOG_ERR, - "Restore MSI for dev %04x:%02x:%02x:%x not set before?\n", + "Restore MSI for %04x:%02x:%02x:%u entry %u not set?\n", pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), - PCI_FUNC(pdev->devfn)); + PCI_FUNC(pdev->devfn), i); spin_unlock_irqrestore(&desc->lock, flags); return -EINVAL; } if ( entry->msi_attrib.type == PCI_CAP_ID_MSI ) + { msi_set_enable(pdev, 0); + nr = entry->msi.nvec; + } else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) msix_set_enable(pdev, 0); msg = entry->msg; write_msi_msg(entry, &msg); - msi_set_mask_bit(desc, entry->msi_attrib.masked); + for ( i = 0; ; ) + { + msi_set_mask_bit(desc, entry[i].msi_attrib.masked); + spin_unlock_irqrestore(&desc->lock, flags); + + if ( !--nr ) + break; + + desc = &irq_desc[entry[++i].irq]; + spin_lock_irqsave(&desc->lock, flags); + if ( desc->msi_desc != entry + i ) + goto bogus; + } + + spin_unlock_irqrestore(&desc->lock, flags); if ( entry->msi_attrib.type == PCI_CAP_ID_MSI ) + { + unsigned int cpos = msi_control_reg(entry->msi_attrib.pos); + u16 control = pci_conf_read16(pdev->seg, pdev->bus, + PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), cpos); + + control &= ~PCI_MSI_FLAGS_QSIZE; + multi_msi_enable(control, entry->msi.nvec); + pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), cpos, control); + msi_set_enable(pdev, 1); + } else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) msix_set_enable(pdev, 1); - - spin_unlock_irqrestore(&desc->lock, flags); } return 0; --- a/xen/arch/x86/physdev.c +++ b/xen/arch/x86/physdev.c @@ -140,8 +140,11 @@ int physdev_map_pirq(domid_t domid, int break; case MAP_PIRQ_TYPE_MSI: + if ( !msi->table_base ) + msi->entry_nr = 1; irq = *index; if ( irq == -1 ) + case MAP_PIRQ_TYPE_MULTI_MSI: irq = create_irq(NUMA_NO_NODE); if ( irq < nr_irqs_gsi || irq >= nr_irqs ) @@ -179,6 +182,30 @@ int physdev_map_pirq(domid_t domid, int goto done; } } + else if ( type == MAP_PIRQ_TYPE_MULTI_MSI ) + { + if ( msi->entry_nr <= 0 || msi->entry_nr > 32 ) + ret = -EDOM; + else if ( msi->entry_nr != 1 && !iommu_intremap ) + ret = -EOPNOTSUPP; + else + { + while ( msi->entry_nr & (msi->entry_nr - 1) ) + msi->entry_nr += msi->entry_nr & -msi->entry_nr; + pirq = get_free_pirqs(d, msi->entry_nr); + if ( pirq < 0 ) + { + while ( (msi->entry_nr >>= 1) > 1 ) + if ( get_free_pirqs(d, msi->entry_nr) > 0 ) + break; + dprintk(XENLOG_G_ERR, "dom%d: no block of %d free pirqs\n", + d->domain_id, msi->entry_nr << 1); + ret = pirq; + } + } + if ( ret < 0 ) + goto done; + } else { pirq = get_free_pirq(d, type); @@ -210,8 +237,15 @@ int physdev_map_pirq(domid_t domid, int done: spin_unlock(&d->event_lock); spin_unlock(&pcidevs_lock); - if ( (ret != 0) && (type == MAP_PIRQ_TYPE_MSI) && (*index == -1) ) - destroy_irq(irq); + if ( ret != 0 ) + switch ( type ) + { + case MAP_PIRQ_TYPE_MSI: + if ( *index == -1 ) + case MAP_PIRQ_TYPE_MULTI_MSI: + destroy_irq(irq); + break; + } free_domain: rcu_unlock_domain(d); return ret; @@ -390,14 +424,22 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H if ( copy_from_guest(&map, arg, 1) != 0 ) break; - if ( map.type == MAP_PIRQ_TYPE_MSI_SEG ) + switch ( map.type ) { + case MAP_PIRQ_TYPE_MSI_SEG: map.type = MAP_PIRQ_TYPE_MSI; msi.seg = map.bus >> 16; - } - else - { + break; + + case MAP_PIRQ_TYPE_MULTI_MSI: + if ( map.table_base ) + return -EINVAL; + msi.seg = map.bus >> 16; + break; + + default: msi.seg = 0; + break; } msi.bus = map.bus; msi.devfn = map.devfn; @@ -406,6 +448,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H ret = physdev_map_pirq(map.domid, map.type, &map.index, &map.pirq, &msi); + if ( map.type == MAP_PIRQ_TYPE_MULTI_MSI ) + map.entry_nr = msi.entry_nr; if ( __copy_to_guest(arg, &map, 1) ) ret = -EFAULT; break; --- a/xen/include/asm-x86/irq.h +++ b/xen/include/asm-x86/irq.h @@ -141,6 +141,7 @@ int map_domain_pirq(struct domain *d, in void *data); int unmap_domain_pirq(struct domain *d, int pirq); int get_free_pirq(struct domain *d, int type); +int get_free_pirqs(struct domain *, unsigned int nr); void free_domain_pirqs(struct domain *d); int map_domain_emuirq_pirq(struct domain *d, int pirq, int irq); int unmap_domain_pirq_emuirq(struct domain *d, int pirq); --- a/xen/include/asm-x86/msi.h +++ b/xen/include/asm-x86/msi.h @@ -145,7 +145,7 @@ int msi_free_irq(struct msi_desc *entry) #define multi_msi_capable(control) \ (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1)) #define multi_msi_enable(control, num) \ - control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE); + control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE); #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) #define msi_enable(control, num) multi_msi_enable(control, num); \ --- a/xen/include/public/physdev.h +++ b/xen/include/public/physdev.h @@ -151,21 +151,22 @@ DEFINE_XEN_GUEST_HANDLE(physdev_irq_t); #define MAP_PIRQ_TYPE_GSI 0x1 #define MAP_PIRQ_TYPE_UNKNOWN 0x2 #define MAP_PIRQ_TYPE_MSI_SEG 0x3 +#define MAP_PIRQ_TYPE_MULTI_MSI 0x4 #define PHYSDEVOP_map_pirq 13 struct physdev_map_pirq { domid_t domid; /* IN */ int type; - /* IN */ + /* IN (ignored for ..._MULTI_MSI) */ int index; /* IN or OUT */ int pirq; - /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */ + /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */ int bus; /* IN */ int devfn; - /* IN */ + /* IN (also OUT for ..._MULTI_MSI) */ int entry_nr; /* IN */ uint64_t table_base;