Xen project Mailing List

Re: [Xen-devel] [PATCH v3 9/9] vpci/msix: add MSI-X handlers

To: "Roger Pau Monne" <roger.pau@xxxxxxxxxx>

From: "Jan Beulich" <JBeulich@xxxxxxxx>

Date: Mon, 29 May 2017 07:29:29 -0600

Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, julien.grall@xxxxxxx, boris.ostrovsky@xxxxxxxxxx, xen-devel@xxxxxxxxxxxxxxxxxxxx

Delivery-date: Mon, 29 May 2017 13:29:45 +0000

List-id: Xen developer discussion <xen-devel.lists.xen.org>

>>> On 27.04.17 at 16:35, <roger.pau@xxxxxxxxxx> wrote: > +static int vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg, > + union vpci_val val, void *data) > +{ > + uint8_t seg = pdev->seg, bus = pdev->bus; > + uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn); > + paddr_t table_base = > pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr; > + struct vpci_msix *msix = data; Wouldn't you better use this also to obtain the array index one line earlier? > + bool new_masked, new_enabled; > + unsigned int i; > + uint32_t data32; > + int rc; > + > + new_masked = val.word & PCI_MSIX_FLAGS_MASKALL; > + new_enabled = val.word & PCI_MSIX_FLAGS_ENABLE; > + > + if ( new_enabled != msix->enabled && new_enabled ) if ( !msix->enabled && new_enabled ) would likely be easier to read (similar for the "else if" below). > + { > + /* MSI-X enabled. */ > + for ( i = 0; i < msix->max_entries; i++ ) > + { > + if ( msix->entries[i].masked ) > + continue; > + > + rc = vpci_msix_enable(&msix->entries[i].arch, pdev, > + msix->entries[i].addr, > msix->entries[i].data, > + msix->entries[i].nr, table_base); > + if ( rc ) > + { > + gdprintk(XENLOG_ERR, > + "%04x:%02x:%02x.%u: unable to update entry %u: > %d\n", > + seg, bus, slot, func, i, rc); > + return rc; > + } > + > + vpci_msix_mask(&msix->entries[i].arch, false); > + } > + } > + else if ( new_enabled != msix->enabled && !new_enabled ) > + { > + /* MSI-X disabled. */ > + for ( i = 0; i < msix->max_entries; i++ ) > + { > + rc = vpci_msix_disable(&msix->entries[i].arch); > + if ( rc ) > + { > + gdprintk(XENLOG_ERR, > + "%04x:%02x:%02x.%u: unable to disable entry %u: > %d\n", > + seg, bus, slot, func, i, rc); > + return rc; > + } > + } > + } > + > + data32 = val.word; > + if ( (new_enabled != msix->enabled || new_masked != msix->masked) && > + pci_msi_conf_write_intercept(pdev, reg, 2, &data32) >= 0 ) > + pci_conf_write16(seg, bus, slot, func, reg, data32); What's the intermediate variable "data32" good for here? Afaict you could use val.word in its stead. > +static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr) > +{ > + struct vpci_msix *msix; > + > + ASSERT(vpci_locked(d)); > + list_for_each_entry ( msix, &d->arch.hvm_domain.msix_tables, next ) > + if ( msix->pdev->vpci->header.command & PCI_COMMAND_MEMORY && Please parenthesize & within &&. > + addr >= msix->addr && > + addr < msix->addr + msix->max_entries * PCI_MSIX_ENTRY_SIZE ) > + return msix; > + > + return NULL; > +} Looking ahead I'm getting the impression that you only allow accesses to the MSI-X table entries, yet in vpci_modify_bars() you (correctly) prevent mapping entire pages. While most other registers are disallowed from sharing a page with the table, the PBA is specifically named as an exception. Hence you need to support at least reads from the entire range. > +static int vpci_msix_table_accept(struct vcpu *v, unsigned long addr) > +{ > + int found; > + > + vpci_lock(v->domain); > + found = !!vpci_msix_find(v->domain, addr); At the risk of repeating a comment I gave on an earlier patch: Using "bool" for "found" allows you to avoid the !! . > +static int vpci_msix_access_check(struct pci_dev *pdev, unsigned long addr, > + unsigned int len) > +{ > + uint8_t seg = pdev->seg, bus = pdev->bus; > + uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn); > + > + No double blank lines please. > + /* Only allow 32/64b accesses. */ > + if ( len != 4 && len != 8 ) > + { > + gdprintk(XENLOG_ERR, > + "%04x:%02x:%02x.%u: invalid MSI-X table access size: %u\n", > + seg, bus, slot, func, len); > + return -EINVAL; > + } > + > + /* Do no allow accesses that span across multiple entries. */ > + if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) + len > PCI_MSIX_ENTRY_SIZE ) > + { > + gdprintk(XENLOG_ERR, > + "%04x:%02x:%02x.%u: MSI-X access crosses entry boundary\n", > + seg, bus, slot, func); > + return -EINVAL; > + } > + > + /* > + * Only allow 64b accesses to the low message address field. > + * > + * NB: this is more restrictive than the specification, that allows 64b > + * accesses to other fields under certain circumstances, so this check > and > + * the code will have to be fixed in order to fully comply with the > + * specification. > + */ > + if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) != 0 && len != 4 ) > + { > + gdprintk(XENLOG_ERR, > + "%04x:%02x:%02x.%u: 64bit MSI-X table access to 32bit field" > + " (offset: %#lx len: %u)\n", seg, bus, slot, func, > + addr & (PCI_MSIX_ENTRY_SIZE - 1), len); > + return -EINVAL; > + } > + > + return 0; > +} So you allow mis-aligned accesses, but you disallow 8-byte ones to the upper half of an entry? I think both aspects need to be got right from the beginning, the more that you BUG() in the switch()es further down in such cases. > +static struct vpci_msix_entry *vpci_msix_get_entry(struct vpci_msix *msix, > + unsigned long addr) > +{ > + return &msix->entries[(addr - msix->addr) / PCI_MSIX_ENTRY_SIZE]; > +} > + > +static int vpci_msix_table_read(struct vcpu *v, unsigned long addr, > + unsigned int len, unsigned long *data) > +{ > + struct vpci_msix *msix; > + struct vpci_msix_entry *entry; > + unsigned int offset; > + > + vpci_lock(v->domain); > + msix = vpci_msix_find(v->domain, addr); > + if ( !msix ) > + { > + vpci_unlock(v->domain); > + return X86EMUL_UNHANDLEABLE; > + } > + > + if ( vpci_msix_access_check(msix->pdev, addr, len) ) > + { > + vpci_unlock(v->domain); > + return X86EMUL_UNHANDLEABLE; > + } > + > + /* Get the table entry and offset. */ > + entry = vpci_msix_get_entry(msix, addr); > + offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); > + > + switch ( offset ) > + { > + case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET: > + *data = entry->addr; You're not clipping off the upper 32 bits here - is that reliably happening elsewhere? > + break; > + case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET: > + *data = entry->addr >> 32; > + break; > + case PCI_MSIX_ENTRY_DATA_OFFSET: > + *data = entry->data; > + break; > + case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET: > + *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0; What about the other 31 bits? > +static int vpci_msix_table_write(struct vcpu *v, unsigned long addr, > + unsigned int len, unsigned long data) > +{ > + struct vpci_msix *msix; > + struct vpci_msix_entry *entry; > + unsigned int offset; > + > + vpci_lock(v->domain); > + msix = vpci_msix_find(v->domain, addr); > + if ( !msix ) > + { > + vpci_unlock(v->domain); > + return X86EMUL_UNHANDLEABLE; > + } > + > + if ( vpci_msix_access_check(msix->pdev, addr, len) ) > + { > + vpci_unlock(v->domain); > + return X86EMUL_UNHANDLEABLE; > + } > + > + /* Get the table entry and offset. */ > + entry = vpci_msix_get_entry(msix, addr); > + offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); > + > + switch ( offset ) > + { > + case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET: > + if ( len == 8 ) > + { > + entry->addr = data; > + break; > + } > + entry->addr &= ~GENMASK(31, 0); > + entry->addr |= data; > + break; > + case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET: > + entry->addr &= ~GENMASK(63, 32); > + entry->addr |= data << 32; > + break; > + case PCI_MSIX_ENTRY_DATA_OFFSET: > + entry->data = data; > + break; > + case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET: > + { > + bool new_masked = data & PCI_MSIX_VECTOR_BITMASK; > + struct pci_dev *pdev = msix->pdev; > + paddr_t table_base = > + pdev->vpci->header.bars[pdev->vpci->msix->bir].paddr; Again simply "msix->bir"? > + int rc; > + > + if ( !msix->enabled ) > + { > + entry->masked = new_masked; > + break; > + } > + > + if ( new_masked != entry->masked && !new_masked ) > + { > + /* Unmasking an entry, update it. */ > + rc = vpci_msix_enable(&entry->arch, msix->pdev, entry->addr, And simply "pdev" here? > +static int vpci_init_msix(struct pci_dev *pdev) > +{ > + struct domain *d = pdev->domain; > + uint8_t seg = pdev->seg, bus = pdev->bus; > + uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn); > + struct vpci_msix *msix; > + unsigned int msix_offset, i, max_entries; > + paddr_t msix_paddr; > + uint16_t control; > + int rc; > + > + msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); > + if ( !msix_offset ) > + return 0; > + > + if ( !vpci_msix_enabled(pdev->domain) ) This is a non-__init function, so it can't use dom0_msix (I'm saying this just in case there really is a need to retain those command line options). > + { > + xen_vpci_mask_capability(pdev, PCI_CAP_ID_MSIX); > + return 0; > + } > + > + control = pci_conf_read16(seg, bus, slot, func, > + msix_control_reg(msix_offset)); > + > + /* Get the maximum number of vectors the device supports. */ > + max_entries = msix_table_size(control); > + if ( !max_entries ) > + return 0; This if() is never going to be true. > + msix = xzalloc_bytes(MSIX_SIZE(max_entries)); > + if ( !msix ) > + return -ENOMEM; > + > + msix->max_entries = max_entries; > + msix->pdev = pdev; > + > + /* Find the MSI-X table address. */ > + msix->offset = pci_conf_read32(seg, bus, slot, func, > + msix_table_offset_reg(msix_offset)); > + msix->bir = msix->offset & PCI_MSIX_BIRMASK; > + msix->offset &= ~PCI_MSIX_BIRMASK; > + > + ASSERT(pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM || > + pdev->vpci->header.bars[msix->bir].type == VPCI_BAR_MEM64_LO); > + msix->addr = pdev->vpci->header.bars[msix->bir].mapped_addr + > msix->offset; > + msix_paddr = pdev->vpci->header.bars[msix->bir].paddr + msix->offset; I can't seem to find where these addresses get updated in case the BARs are being relocated by the Dom0 kernel. > + for ( i = 0; i < msix->max_entries; i++) > + { > + msix->entries[i].masked = true; > + msix->entries[i].nr = i; > + vpci_msix_arch_init(&msix->entries[i].arch); > + } > + > + if ( list_empty(&d->arch.hvm_domain.msix_tables) ) > + register_mmio_handler(d, &vpci_msix_table_ops); > + > + list_add(&msix->next, &d->arch.hvm_domain.msix_tables); > + > + rc = xen_vpci_add_register(pdev, vpci_msix_control_read, > + vpci_msix_control_write, > + msix_control_reg(msix_offset), 2, msix); > + if ( rc ) > + { > + dprintk(XENLOG_ERR, > + "%04x:%02x:%02x.%u: failed to add handler for MSI-X control: > %d\n", > + seg, bus, slot, func, rc); > + goto error; > + } > + > + if ( pdev->vpci->header.command & PCI_COMMAND_MEMORY ) > + { > + /* Unmap this memory from the guest. */ > + rc = modify_mmio(pdev->domain, _gfn(PFN_DOWN(msix->addr)), > + _mfn(PFN_DOWN(msix_paddr)), > + PFN_UP(msix->max_entries * PCI_MSIX_ENTRY_SIZE), > + false); > + if ( rc ) > + { > + dprintk(XENLOG_ERR, > + "%04x:%02x:%02x.%u: unable to unmap MSI-X BAR region: > %d\n", > + seg, bus, slot, func, rc); > + goto error; > + } > + } Why is this unmapping conditional upon PCI_COMMAND_MEMORY? > +static void vpci_dump_msix(unsigned char key) > +{ > + struct domain *d; > + struct pci_dev *pdev; > + > + printk("Guest MSI-X information:\n"); > + > + for_each_domain ( d ) > + { > + if ( !has_vpci(d) ) > + continue; > + > + vpci_lock(d); Dump handlers, even if there are existing examples to the contrary, should only try-lock any locks they mean to hold (and not dump anything if they can't get hold of the lock). > --- a/xen/include/xen/vpci.h > +++ b/xen/include/xen/vpci.h > @@ -112,6 +112,33 @@ struct vpci { > /* Arch-specific data. */ > struct vpci_arch_msi arch; > } *msi; > + > + /* MSI-X data. */ > + struct vpci_msix { > + struct pci_dev *pdev; > + /* Maximum number of vectors supported by the device. */ > + unsigned int max_entries; > + /* MSI-X table offset. */ > + unsigned int offset; > + /* MSI-X table BIR. */ > + unsigned int bir; > + /* Table addr. */ > + paddr_t addr; > + /* MSI-X enabled? */ > + bool enabled; > + /* Masked? */ > + bool masked; > + /* List link. */ > + struct list_head next; > + /* Entries. */ > + struct vpci_msix_entry { > + unsigned int nr; > + uint64_t addr; > + uint32_t data; > + bool masked; > + struct vpci_arch_msix_entry arch; Indentation. Jan _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx https://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.