Re: [Xen-devel] [PATCH v4 9/9] vpci/msix: add MSI-X handlers
>>> Roger Pau Monne <roger.pau@xxxxxxxxxx> 06/30/17 5:01 PM >>>
>Note that accesses to the Table Offset, Table BIR, PBA Offset and PBA
>BIR are not trapped by Xen at the moment.

They're mandated r/o by the spec anyway.

>@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
>     if ( IS_ERR(mem) )
>         return -PTR_ERR(mem);
>
>+    /*
>+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
>+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
>+     * since that's when the memory decoding on the device is enabled.
>+     */
>+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
>+    {
>+        struct vpci_msix_mem *msix = bar->msix[i];
>+
>+        if ( !msix || msix->addr == INVALID_PADDR )
>+            continue;
>+
>+        rc = vpci_unmap_msix(d, msix);

Why do you need this, instead of being able to simply rely on the
rangeset based (un)mapping?

>@@ -405,7 +475,20 @@ static int vpci_init_bars(struct pci_dev *pdev)
>             continue;
>         }
>
>-        bars[i].addr = (cmd & PCI_COMMAND_MEMORY) ? addr : INVALID_PADDR;
>+        if ( cmd & PCI_COMMAND_MEMORY )
>+        {
>+            unsigned int j;
>+
>+            bars[i].addr = addr;
>+
>+            for ( j = 0; j < ARRAY_SIZE(bars[i].msix); j++ )
>+                if ( bars[i].msix[j] )
>+                    bars[i].msix[j]->addr = bars[i].addr +
>+                                            bars[i].msix[j]->offset;
>+        }
>+        else
>+            bars[i].addr = INVALID_PADDR;

As (I think) mentioned elsewhere already, this init-time special case
looks dangerous (and unnecessary) to me (or else I'd expect you to also
zap the field when the memory decode bit is being cleared).

>--- /dev/null
>+++ b/xen/drivers/vpci/msix.c
>@@ -0,0 +1,503 @@
>+/*
>+ * Handlers for accesses to the MSI-X capability structure and the memory
>+ * region.
>+ *
>+ * Copyright (C) 2017 Citrix Systems R&D
>+ *
>+ * This program is free software; you can redistribute it and/or
>+ * modify it under the terms and conditions of the GNU General Public
>+ * License, version 2, as published by the Free Software Foundation.
>+ *
>+ * This program is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public
>+ * License along with this program; If not, see
>+ * <http://www.gnu.org/licenses/>.
>+ */
>+
>+#include <xen/sched.h>
>+#include <xen/vpci.h>
>+#include <asm/msi.h>
>+#include <xen/p2m-common.h>
>+#include <xen/keyhandler.h>
>+
>+#define MSIX_SIZE(num) (offsetof(struct vpci_msix, entries[num]))

The outermost parens are pointless here.

>+static void vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
>+                                    union vpci_val val, void *data)
>+{
>+    uint8_t seg = pdev->seg, bus = pdev->bus;
>+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>+    struct vpci_msix *msix = data;
>+    paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
>+    bool new_masked, new_enabled;
>+    unsigned int i;
>+    int rc;
>+
>+    new_masked = val.u16 & PCI_MSIX_FLAGS_MASKALL;
>+    new_enabled = val.u16 & PCI_MSIX_FLAGS_ENABLE;
>+
>+    if ( !msix->enabled && new_enabled )
>+    {
>+        /* MSI-X enabled. */

Insert "being"?

>+        for ( i = 0; i < msix->max_entries; i++ )
>+        {
>+            if ( msix->entries[i].masked )
>+                continue;

Why is the mask bit relevant here, but not the mask-all one?

>+            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
>+                                       msix->entries[i].addr,
>+                                       msix->entries[i].data,
>+                                       msix->entries[i].nr, table_base);
>+            if ( rc )
>+            {
>+                gdprintk(XENLOG_ERR,
>+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
>+                         seg, bus, slot, func, i, rc);
>+                return;
>+            }
>+
>+            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);

Same question here.

>+        }
>+    }
>+    else if ( msix->enabled && !new_enabled )
>+    {
>+        /* MSI-X disabled. */
>+        for ( i = 0; i < msix->max_entries; i++ )
>+        {
>+            rc = vpci_msix_arch_disable(&msix->entries[i].arch, pdev);
>+            if ( rc )
>+            {
>+                gdprintk(XENLOG_ERR,
>+                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
>+                         seg, bus, slot, func, i, rc);
>+                return;
>+            }
>+        }
>+    }
>+
>+    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
>+         pci_msi_conf_write_intercept(pdev, reg, 2, &val.u32) >= 0 )
>+        pci_conf_write16(seg, bus, slot, func, reg, val.u32);

DYM val.u16 here?

>+static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
>+{
>+    struct vpci_msix *msix;
>+
>+    ASSERT(vpci_locked(d));
>+    list_for_each_entry ( msix, &d->arch.hvm_domain.msix_tables, next )
>+    {
>+        uint8_t seg = msix->pdev->seg, bus = msix->pdev->bus;
>+        uint8_t slot = PCI_SLOT(msix->pdev->devfn);
>+        uint8_t func = PCI_FUNC(msix->pdev->devfn);
>+        uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);

Perhaps better to keep a cached copy of the command register value?

>+static int vpci_msix_read(struct vcpu *v, unsigned long addr,
>+                          unsigned int len, unsigned long *data)
>+{
>+    struct domain *d = v->domain;
>+    struct vpci_msix *msix;
>+    const struct vpci_msix_entry *entry;
>+    unsigned int offset;
>+
>+    vpci_lock(d);
>+
>+    msix = vpci_msix_find(d, addr);
>+    if ( !msix )
>+    {
>+        vpci_unlock(d);
>+        *data = ~0ul;
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
>+    {
>+        vpci_unlock(d);
>+        *data = ~0ul;
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
>+    {
>+        /* Access to PBA. */
>+        switch ( len )
>+        {
>+        case 4:
>+            *data = readl(addr);
>+            break;
>+        case 8:
>+            *data = readq(addr);
>+            break;
>+        default:
>+            ASSERT_UNREACHABLE();

*data = ~0ul;

>+static int vpci_msix_write(struct vcpu *v, unsigned long addr,
>+                           unsigned int len, unsigned long data)
>+{
>+    struct domain *d = v->domain;
>+    struct vpci_msix *msix;
>+    struct vpci_msix_entry *entry;
>+    unsigned int offset;
>+
>+    vpci_lock(d);
>+    msix = vpci_msix_find(d, addr);
>+    if ( !msix )
>+    {
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
>+    {
>+        /* Ignore writes to PBA, its behavior is undefined. */
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
>+    {
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    /* Get the table entry and offset. */
>+    entry = vpci_msix_get_entry(msix, addr);
>+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
>+
>+    switch ( offset )
>+    {
>+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
>+        if ( len == 8 )
>+        {
>+            entry->addr = data;
>+            break;
>+        }
>+        entry->addr &= ~0xffffffff;

With this, why not ...

>+        entry->addr |= data;
>+        break;
>+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
>+        entry->addr &= ~((uint64_t)0xffffffff << 32);

... simply 0xffffffff here?

>+        entry->addr |= data << 32;

Iirc we've already talked about this being undefined for 32-bit arches
(e.g. ARM32), and the resulting need to make "data" uint64_t.

>+        break;
>+    case PCI_MSIX_ENTRY_DATA_OFFSET:
>+        /*
>+         * 8 byte writes to the msg data and vector control fields are
>+         * only allowed if the entry is masked.
>+         */
>+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
>+        {
>+            vpci_unlock(d);
>+            return X86EMUL_OKAY;
>+        }

I don't think this is correct - iirc such writes simply don't take
effect immediately (but I then seem to recall this to apply to the
address field and 32-bit writes to the data field as well). They'd
instead take effect the next time the entry is being unmasked (or
some such).
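To illustrate the behavior I mean, here is a minimal self-contained model (hypothetical names and structure, deliberately simplified, not the actual vpci code) of staging a message-data write and only committing it on the next mask-to-unmask transition:

```c
#include <stdint.h>
#include <stdbool.h>

/* Simplified model of one MSI-X table entry with a staging area. */
struct msix_entry_model {
    uint32_t data;        /* value the "hardware" actually uses */
    uint32_t staged_data; /* most recent guest write */
    bool masked;
    bool pending;         /* staged value not yet committed */
};

/* Guest write to the message data field. */
static void entry_write_data(struct msix_entry_model *e, uint32_t data)
{
    e->staged_data = data;
    if ( e->masked )
        e->data = data;    /* takes effect immediately while masked */
    else
        e->pending = true; /* deferred until the next unmask */
}

/* Guest write to the vector control mask bit. */
static void entry_set_mask(struct msix_entry_model *e, bool mask)
{
    if ( e->masked && !mask && e->pending )
    {
        /* Mask -> unmask transition: commit the staged value. */
        e->data = e->staged_data;
        e->pending = false;
    }
    e->masked = mask;
}
```

With such a model a write to an unmasked entry is never dropped, it is merely latched until software masks and re-unmasks the entry, which is closer to what the spec describes than outright ignoring the access.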
A while ago I did fix the qemu code to behave in this way.

>+        entry->data = data;
>+
>+        if ( len == 4 )
>+            break;
>+
>+        data >>= 32;
>+        /* fallthrough */
>+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
>+    {
>+        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
>+        struct pci_dev *pdev = msix->pdev;
>+        paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
>+        int rc;
>+
>+        if ( !msix->enabled )
>+        {
>+            entry->masked = new_masked;
>+            break;
>+        }
>+
>+        if ( new_masked != entry->masked && !new_masked )

    if ( !new_masked && entry->masked )

(or the other way around)

>+static int vpci_init_msix(struct pci_dev *pdev)
>+{
>+    struct domain *d = pdev->domain;
>+    uint8_t seg = pdev->seg, bus = pdev->bus;
>+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>+    struct vpci_msix *msix;
>+    unsigned int msix_offset, i, max_entries;
>+    struct vpci_bar *table_bar, *pba_bar;
>+    uint16_t control;
>+    int rc;
>+
>+    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
>+    if ( !msix_offset )
>+        return 0;
>+
>+    control = pci_conf_read16(seg, bus, slot, func,
>+                              msix_control_reg(msix_offset));
>+
>+    /* Get the maximum number of vectors the device supports. */
>+    max_entries = msix_table_size(control);
>+
>+    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
>+    if ( !msix )
>+        return -ENOMEM;
>+
>+    msix->max_entries = max_entries;
>+    msix->pdev = pdev;
>+
>+    /* Find the MSI-X table address. */
>+    msix->table.offset = pci_conf_read32(seg, bus, slot, func,
>+                                         msix_table_offset_reg(msix_offset));
>+    msix->table.bir = msix->table.offset & PCI_MSIX_BIRMASK;
>+    msix->table.offset &= ~PCI_MSIX_BIRMASK;
>+    msix->table.size = msix->max_entries * PCI_MSIX_ENTRY_SIZE;
>+    msix->table.addr = INVALID_PADDR;
>+
>+    /* Find the MSI-X pba address. */
>+    msix->pba.offset = pci_conf_read32(seg, bus, slot, func,
>+                                       msix_pba_offset_reg(msix_offset));
>+    msix->pba.bir = msix->pba.offset & PCI_MSIX_BIRMASK;
>+    msix->pba.offset &= ~PCI_MSIX_BIRMASK;
>+    msix->pba.size = DIV_ROUND_UP(msix->max_entries, 8);

I think you want to round up to at least the next 32-bit boundary; the
spec talking about bits 63..00 even suggests a 64-bit boundary. The
table addresses being required to be qword aligned also supports this.

>+void vpci_dump_msix(void)
>+{
>+    struct domain *d;
>+    struct pci_dev *pdev;

const for all pointers in dump handlers, as far as possible.

>+    for_each_domain ( d )
>+    {
>+        if ( !has_vpci(d) )
>+            continue;
>+
>+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);

Wouldn't it be better (more useful) to dump the MSI and MSI-X data for
a domain next to each other?

Apart from the comments here, the ones given for the MSI patch apply
respectively.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel