[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses


  • To: Jan Beulich <JBeulich@xxxxxxxx>, xen-devel <xen-devel@xxxxxxxxxxxxx>
  • From: "Zhang, Xiantao" <xiantao.zhang@xxxxxxxxx>
  • Date: Mon, 25 Feb 2013 05:17:29 +0000
  • Accept-language: en-US
  • Delivery-date: Mon, 25 Feb 2013 05:18:12 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xen.org>
  • Thread-index: AQHOBIp1bevntmDnjEewxuNkgxh7+piKIr2A
  • Thread-topic: [Xen-devel] [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses

Hi Jan,
I think this fix also covers the HVM side, right? For the PV side, I recall there
was already a fix in Xend to protect PV guests from accessing the related MMIO
range. Also, for HVM guests, last year you proposed and submitted a patch to
Xen-Qemu to block a guest's accesses to the related MMIO range. It looks like
the previous fixes are no longer needed once this fix is checked in, right?
Thanks!
Xiantao

> -----Original Message-----
> From: xen-devel-bounces@xxxxxxxxxxxxx [mailto:xen-devel-
> bounces@xxxxxxxxxxxxx] On Behalf Of Jan Beulich
> Sent: Thursday, February 07, 2013 12:51 AM
> To: xen-devel
> Subject: [Xen-devel] [PATCH] x86/MSI: add mechanism to protect MSI-X
> table from PV guest accesses
> 
> This adds two new physdev operations for Dom0 to invoke when resource
> allocation for devices is known to be complete, so that the hypervisor
> can arrange for the respective MMIO ranges to be marked read-only
> before an eventual guest getting such a device assigned even gets
> started, such that it won't be able to set up writable mappings for
> these MMIO ranges before Xen has a chance to protect them.
> 
> This also addresses another issue with the code being modified here,
> in that so far write protection for the address ranges in question got
> set up only once during the lifetime of a device (i.e. until either
> system shutdown or device hot removal), while teardown happened when
> the last interrupt was disposed of by the guest (which at least allowed
> the tables to be writable when the device got assigned to a second
> guest [instance] after the first terminated).
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> 
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -656,8 +656,8 @@ static u64 read_pci_mem_bar(u16 seg, u8
>   * @entries: pointer to an array of struct msix_entry entries
>   * @nvec: number of @entries
>   *
> - * Setup the MSI-X capability structure of device function with a
> - * single MSI-X irq. A return of zero indicates the successful setup of
> + * Setup the MSI-X capability structure of device function with the
> + * requested number of MSI-X irqs. A return of zero indicates the successful setup of
> + * requested MSI-X entries with allocated irqs or non-zero for otherwise.
>   **/
>  static int msix_capability_init(struct pci_dev *dev,
> @@ -665,86 +665,69 @@ static int msix_capability_init(struct p
>                                  struct msi_desc **desc,
>                                  unsigned int nr_entries)
>  {
> -    struct msi_desc *entry;
> -    int pos;
> +    struct msi_desc *entry = NULL;
> +    int pos, vf;
>      u16 control;
> -    u64 table_paddr, entry_paddr;
> -    u32 table_offset, entry_offset;
> -    u8 bir;
> -    void __iomem *base;
> -    int idx;
> +    u64 table_paddr;
> +    u32 table_offset;
> +    u8 bir, pbus, pslot, pfunc;
>      u16 seg = dev->seg;
>      u8 bus = dev->bus;
>      u8 slot = PCI_SLOT(dev->devfn);
>      u8 func = PCI_FUNC(dev->devfn);
> 
>      ASSERT(spin_is_locked(&pcidevs_lock));
> -    ASSERT(desc);
> 
>      pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
>      control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
>      msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
> 
> -    /* MSI-X Table Initialization */
> -    entry = alloc_msi_entry();
> -    if ( !entry )
> -        return -ENOMEM;
> +    if ( desc )
> +    {
> +        entry = alloc_msi_entry();
> +        if ( !entry )
> +            return -ENOMEM;
> +        ASSERT(msi);
> +    }
> 
> -    /* Request & Map MSI-X table region */
> +    /* Locate MSI-X table region */
>      table_offset = pci_conf_read32(seg, bus, slot, func,
>                                     msix_table_offset_reg(pos));
>      bir = (u8)(table_offset & PCI_MSIX_BIRMASK);
>      table_offset &= ~PCI_MSIX_BIRMASK;
> -    entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
> 
> -    table_paddr = msi->table_base + table_offset;
> -    entry_paddr = table_paddr + entry_offset;
> -    idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
> -    if ( idx < 0 )
> -    {
> -        xfree(entry);
> -        return idx;
> -    }
> -    base = (void *)(fix_to_virt(idx) +
> -        ((unsigned long)entry_paddr & ((1UL << PAGE_SHIFT) - 1)));
> -
> -    entry->msi_attrib.type = PCI_CAP_ID_MSIX;
> -    entry->msi_attrib.is_64 = 1;
> -    entry->msi_attrib.entry_nr = msi->entry_nr;
> -    entry->msi_attrib.maskbit = 1;
> -    entry->msi_attrib.masked = 1;
> -    entry->msi_attrib.pos = pos;
> -    entry->irq = msi->irq;
> -    entry->dev = dev;
> -    entry->mask_base = base;
> -
> -    list_add_tail(&entry->list, &dev->msi_list);
> -
> -    if ( !dev->msix_nr_entries )
> +    if ( !dev->info.is_virtfn )
>      {
> -        u8 pbus, pslot, pfunc;
> -        int vf;
> -        u64 pba_paddr;
> -        u32 pba_offset;
> +        pbus = bus;
> +        pslot = slot;
> +        pfunc = func;
> +        vf = -1;
> +    }
> +    else
> +    {
> +        pbus = dev->info.physfn.bus;
> +        pslot = PCI_SLOT(dev->info.physfn.devfn);
> +        pfunc = PCI_FUNC(dev->info.physfn.devfn);
> +        vf = PCI_BDF2(dev->bus, dev->devfn);
> +    }
> 
> -        if ( !dev->info.is_virtfn )
> -        {
> -            pbus = bus;
> -            pslot = slot;
> -            pfunc = func;
> -            vf = -1;
> -        }
> -        else
> +    table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
> +    WARN_ON(msi && msi->table_base != table_paddr);
> +    if ( !table_paddr )
> +    {
> +        if ( !msi || !msi->table_base )
>          {
> -            pbus = dev->info.physfn.bus;
> -            pslot = PCI_SLOT(dev->info.physfn.devfn);
> -            pfunc = PCI_FUNC(dev->info.physfn.devfn);
> -            vf = PCI_BDF2(dev->bus, dev->devfn);
> +            xfree(entry);
> +            return -ENXIO;
>          }
> +        table_paddr = msi->table_base;
> +    }
> +    table_paddr += table_offset;
> 
> -        ASSERT(!dev->msix_used_entries);
> -        WARN_ON(msi->table_base !=
> -                read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf));
> +    if ( !dev->msix_used_entries )
> +    {
> +        u64 pba_paddr;
> +        u32 pba_offset;
> 
>          dev->msix_nr_entries = nr_entries;
>          dev->msix_table.first = PFN_DOWN(table_paddr);
> @@ -765,7 +748,42 @@ static int msix_capability_init(struct p
>                                        BITS_TO_LONGS(nr_entries) - 1);
> >          WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first,
> >                                          dev->msix_pba.last));
> +    }
> 
> +    if ( entry )
> +    {
> +        /* Map MSI-X table region */
> > +        u64 entry_paddr = table_paddr + msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
> +        int idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
> +        void __iomem *base;
> +
> +        if ( idx < 0 )
> +        {
> +            xfree(entry);
> +            return idx;
> +        }
> +        base = (void *)(fix_to_virt(idx) +
> +                        ((unsigned long)entry_paddr & (PAGE_SIZE - 1)));
> +
> +        /* Mask interrupt here */
> +        writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> +
> +        entry->msi_attrib.type = PCI_CAP_ID_MSIX;
> +        entry->msi_attrib.is_64 = 1;
> +        entry->msi_attrib.entry_nr = msi->entry_nr;
> +        entry->msi_attrib.maskbit = 1;
> +        entry->msi_attrib.masked = 1;
> +        entry->msi_attrib.pos = pos;
> +        entry->irq = msi->irq;
> +        entry->dev = dev;
> +        entry->mask_base = base;
> +
> +        list_add_tail(&entry->list, &dev->msi_list);
> +        *desc = entry;
> +    }
> +
> +    if ( !dev->msix_used_entries )
> +    {
>          if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
>                                  dev->msix_table.last) )
>              WARN();
> @@ -776,7 +794,7 @@ static int msix_capability_init(struct p
>          if ( dev->domain )
>              p2m_change_entry_type_global(dev->domain,
>                                           p2m_mmio_direct, p2m_mmio_direct);
> -        if ( !dev->domain || !paging_mode_translate(dev->domain) )
> +        if ( desc && (!dev->domain || !paging_mode_translate(dev->domain)) )
>          {
>              struct domain *d = dev->domain;
> 
> @@ -790,6 +808,13 @@ static int msix_capability_init(struct p
>                          break;
>              if ( d )
>              {
> +                if ( !IS_PRIV(d) && dev->msix_warned != d->domain_id )
> +                {
> +                    dev->msix_warned = d->domain_id;
> > +                    printk(XENLOG_ERR
> > +                           "Potentially insecure use of MSI-X on %04x:%02x:%02x.%u by Dom%d\n",
> > +                           seg, bus, slot, func, d->domain_id);
> +                }
>                  /* XXX How to deal with existing mappings? */
>              }
>          }
> @@ -798,10 +823,6 @@ static int msix_capability_init(struct p
>      WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
>      ++dev->msix_used_entries;
> 
> -    /* Mask interrupt here */
> -    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> -
> -    *desc = entry;
>      /* Restore MSI-X enabled bits */
>      pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
> 
> @@ -926,6 +947,19 @@ static int __pci_enable_msix(struct msi_
>      return status;
>  }
> 
> +static void _pci_cleanup_msix(struct pci_dev *dev)
> +{
> +    if ( !--dev->msix_used_entries )
> +    {
> +        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
> +                                   dev->msix_table.last) )
> +            WARN();
> +        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
> +                                   dev->msix_pba.last) )
> +            WARN();
> +    }
> +}
> +
>  static void __pci_disable_msix(struct msi_desc *entry)
>  {
>      struct pci_dev *dev;
> @@ -949,15 +983,42 @@ static void __pci_disable_msix(struct ms
> 
>      pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
> 
> -    if ( !--dev->msix_used_entries )
> +    _pci_cleanup_msix(dev);
> +}
> +
> +int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off)
> +{
> +    int rc;
> +    struct pci_dev *pdev;
> +    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
> +    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
> +                                           PCI_CAP_ID_MSIX);
> +
> +    if ( !pos )
> +        return -ENODEV;
> +
> +    spin_lock(&pcidevs_lock);
> +    pdev = pci_get_pdev(seg, bus, devfn);
> +    if ( !pdev )
> +        rc = -ENODEV;
> +    else if ( pdev->msix_used_entries != !!off )
> +        rc = -EBUSY;
> +    else if ( off )
>      {
> -        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
> -                                  dev->msix_table.last) )
> -            WARN();
> -        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
> -                                   dev->msix_pba.last) )
> -            WARN();
> +        _pci_cleanup_msix(pdev);
> +        rc = 0;
>      }
> +    else
> +    {
> +        u16 control = pci_conf_read16(seg, bus, slot, func,
> +                                      msix_control_reg(pos));
> +
> +        rc = msix_capability_init(pdev, NULL, NULL,
> +                                  multi_msix_capable(control));
> +    }
> +    spin_unlock(&pcidevs_lock);
> +
> +    return rc;
>  }
> 
>  /*
> --- a/xen/arch/x86/physdev.c
> +++ b/xen/arch/x86/physdev.c
> @@ -575,6 +575,18 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
>          break;
>      }
> 
> +    case PHYSDEVOP_prepare_msix:
> +    case PHYSDEVOP_release_msix: {
> +        struct physdev_pci_device dev;
> +
> +        if ( copy_from_guest(&dev, arg, 1) )
> +            ret = -EFAULT;
> +        else
> +            ret = pci_prepare_msix(dev.seg, dev.bus, dev.devfn,
> +                                   cmd != PHYSDEVOP_prepare_msix);
> +        break;
> +    }
> +
>      case PHYSDEVOP_pci_mmcfg_reserved: {
>          struct physdev_pci_mmcfg_reserved info;
> 
> --- a/xen/include/asm-x86/msi.h
> +++ b/xen/include/asm-x86/msi.h
> @@ -76,6 +76,7 @@ struct msi_desc;
>  /* Helper functions */
>  extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
>  extern void pci_disable_msi(struct msi_desc *desc);
> +extern int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off);
>  extern void pci_cleanup_msi(struct pci_dev *pdev);
>  extern void setup_msi_handler(struct irq_desc *, struct msi_desc *);
>  extern void setup_msi_irq(struct irq_desc *);
> --- a/xen/include/public/physdev.h
> +++ b/xen/include/public/physdev.h
> @@ -303,6 +303,12 @@ DEFINE_XEN_GUEST_HANDLE(physdev_pci_devi
> 
>  #define PHYSDEVOP_pci_device_remove     26
>  #define PHYSDEVOP_restore_msi_ext       27
> +/*
> + * Dom0 should use these two to announce MMIO resources assigned to
> + * MSI-X capable devices won't (prepare) or may (release) change.
> + */
> +#define PHYSDEVOP_prepare_msix          30
> +#define PHYSDEVOP_release_msix          31
>  struct physdev_pci_device {
>      /* IN */
>      uint16_t seg;
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -57,6 +57,7 @@ struct pci_dev {
>      int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
>      int msix_table_idx[MAX_MSIX_TABLE_PAGES];
>      spinlock_t msix_table_lock;
> +    domid_t msix_warned;
> 
>      struct domain *domain;
>      const u16 seg;
> 


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.