[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v3 3/4] x86/hvm: Allow writes to registers on the same page as MSI-X table


  • To: Marek Marczykowski-Górecki <marmarek@xxxxxxxxxxxxxxxxxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Mon, 24 Apr 2023 15:59:15 +0200
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=suse.com; dmarc=pass action=none header.from=suse.com; dkim=pass header.d=suse.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=h7K8zzvcm/qI7O13PT/5IEE5ayUGR9RPHKuFzhwTJvQ=; b=VDZPdKTVhrqehNxBRjxuI4zwVX+nwK6gA4B8aHe3d/QNM94yl+HBEL3uF0odMLUDB6hNfykgsFvlbE5bB/59ej93rrQGV4BjgV4+VQLhD/6T3mi+FP3PfT1VUEg/mlTktOy3GSP8+H/5/tYy7scb2lUn/mmjB4FaTkPEXJl9UOyJ8G9X0TTn2LnR+z/p6aQxHgWaqL1lJbl+AInktCCecJljIxkIMBCWaGVUX1e5PwgkJMuaH4eXAKFpdtAe9mADT7fqCLbie7QPtdnjMFUxfmdbZuKhGvRqJkszVvQwYR4BgtAke0C5+2+4qTWJf15b9qDoqeoQ8WSnEphCtMgBuQ==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=iXx/GP3+Rzh8Y1Ywy2CcnTrPgX2BVk0E9NkQfz4erQE6yxf8o/otzT0FhbCVrOLFmi6AAJnXGCv+Hb7uAog+bS9AlXyQpwH752VrW2iijbBNCqizYBbgVnObwkqCz9w8EMSf5CHMqfmKmASPi1KsKhwPrxKQCcaeYBzKxTwXmbtMHoqtMDPvSDCyatoxMGN21+Qy4mi8O8OyjaqQvPH+mrBbbva2qVfO9lEB3CES3cU73yy30UXUo0G617dfQ8Mmv+98tyRu7NlvKWHzjrNuR+V74Z0IdLtg7gwTUcXcbd62Lia7HGcwVCqXBAAOrZfqbmxtxRXYEGxgjAXz8TdqWQ==
  • Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=suse.com;
  • Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx
  • Delivery-date: Mon, 24 Apr 2023 13:59:34 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 06.04.2023 05:57, Marek Marczykowski-Górecki wrote:
> --- a/xen/arch/x86/hvm/vmsi.c
> +++ b/xen/arch/x86/hvm/vmsi.c
> @@ -181,15 +181,21 @@ static bool msixtbl_initialised(const struct domain *d)
>  }
>  
>  static struct msixtbl_entry *msixtbl_find_entry(
> -    struct vcpu *v, unsigned long addr)
> +    struct vcpu *v, unsigned long addr, bool same_page)
>  {
>      struct msixtbl_entry *entry;
>      struct domain *d = v->domain;
>  
>      list_for_each_entry( entry, &d->arch.hvm.msixtbl_list, list )
> +    {
> +        if ( same_page &&
> +             PFN_DOWN(addr) >= PFN_DOWN(entry->gtable) &&
> +             PFN_DOWN(addr) <= PFN_DOWN(entry->gtable + entry->table_len) )

This will allow access to the subsequent page in case entry->gtable +
entry->table_len falls on a page boundary. I think you mean "< PFN_UP()"
or "<= PFN_DOWN(... - 1)".

But I anyway don't understand why you need both this and ...

> +            return entry;
>          if ( addr >= entry->gtable &&
>               addr < entry->gtable + entry->table_len )
>              return entry;

... this (and the new function parameter). Again like in the recently
added vPCI code, all you should need ought to be the new code you add
(see msix.c:msix_find()), with the caller then separating "in table" vs
"other space". Only one of the four callers really cares about _just_
the table range. In no event do you need to do both pairs of checks:

        if ( same_page
             ? PFN_DOWN(addr) >= PFN_DOWN(entry->gtable) &&
               ...
             : ...

(i.e. in case you disagree with moving the original check to the sole
place where it's needed).

> @@ -213,6 +219,144 @@ static struct msi_desc *msixtbl_addr_to_desc(
>      return NULL;
>  }
>  
> +/*
> + * Returns:
> + *  - UINT_MAX if no handling should be done
> + *  - UINT_MAX-1 if write should be discarded
> + *  - a fixmap idx to use for handling
> + */
> +#define ADJACENT_DONT_HANDLE UINT_MAX
> +#define ADJACENT_DISCARD_WRITE (UINT_MAX - 1)
> +static unsigned int adjacent_handle(
> +    const struct msixtbl_entry *entry, unsigned long addr, bool write)
> +{
> +    unsigned int adj_type;
> +
> +    if ( !entry || !entry->pdev )
> +        return ADJACENT_DONT_HANDLE;
> +
> +    if ( PFN_DOWN(addr) == PFN_DOWN(entry->gtable) && addr < entry->gtable )
> +        adj_type = ADJ_IDX_FIRST;
> +    else if ( PFN_DOWN(addr) == PFN_DOWN(entry->gtable + entry->table_len - 
> 1) &&
> +              addr >= entry->gtable + entry->table_len )
> +        adj_type = ADJ_IDX_LAST;
> +    else
> +        return ADJACENT_DONT_HANDLE;
> +
> +    ASSERT(entry->pdev->msix);

Considering the code below, you probably want to latch this pointer into
a local variable.

> +    if ( !entry->pdev->msix->adj_access_table_idx[adj_type] )
> +    {
> +        gprintk(XENLOG_WARNING,
> +                "Page for adjacent(%d) MSI-X table access not initialized 
> for %pp (addr %#lx, gtable %#lx\n",
> +                adj_type, &entry->pdev->sbdf, addr, entry->gtable);
> +
> +        return ADJACENT_DONT_HANDLE;
> +    }
> +
> +    /* If PBA lives on the same page too, discard writes. */
> +    if ( write && (

Nit: Style (no standalone opening parenthesis ought to be last on a line).

> +        (adj_type == ADJ_IDX_LAST &&
> +         entry->pdev->msix->table.last == entry->pdev->msix->pba.first) ||
> +        (adj_type == ADJ_IDX_FIRST &&
> +         entry->pdev->msix->table.first == entry->pdev->msix->pba.last)) )

And these all need indenting by one more blank.

> +    {
> +        gprintk(XENLOG_WARNING,
> +                 "MSI-X table and PBA of %pp live on the same page, "
> +                 "writing to other registers there is not implemented\n",
> +                 &entry->pdev->sbdf);

Whereas here a blank needs dropping. But - don't you discard writes to
more than just the PBA here?

> +        return ADJACENT_DISCARD_WRITE;
> +    }
> +
> +    return entry->pdev->msix->adj_access_table_idx[adj_type];
> +}
> +
> +static int adjacent_read(
> +        unsigned int fixmap_idx,
> +        uint64_t address, uint32_t len, uint64_t *pval)

Nit: Style (too deep indentation). Also while I'm fine with the two
uint64_t (arguably the former may want to be paddr_t), the uint32_t
clearly isn't needed here, and wants to be unsigned int.

> +{
> +    const void __iomem *hwaddr;
> +
> +    *pval = ~0UL;
> +
> +    if ( !IS_ALIGNED(address, len) )
> +    {
> +        gdprintk(XENLOG_WARNING,
> +                "Dropping unaligned read from MSI-X table page at %" PRIx64 
> "\n",
> +                address);

Here indentation is one too little again.

> +        return X86EMUL_OKAY;
> +    }
> +
> +    ASSERT(fixmap_idx != ADJACENT_DISCARD_WRITE);
> +
> +    hwaddr = fix_to_virt(fixmap_idx) + PAGE_OFFSET(address);
> +
> +    switch ( len )
> +    {
> +    case 1:
> +        *pval = readb(hwaddr);
> +        break;
> +
> +    case 2:
> +        *pval = readw(hwaddr);
> +        break;
> +
> +    case 4:
> +        *pval = readl(hwaddr);
> +        break;
> +
> +    case 8:
> +        *pval = readq(hwaddr);
> +        break;
> +
> +    default:
> +        ASSERT_UNREACHABLE();
> +    }
> +    return X86EMUL_OKAY;
> +}
> +
> +static int adjacent_write(
> +        unsigned int fixmap_idx,
> +        uint64_t address, uint32_t len, uint64_t val)
> +{
> +    void __iomem *hwaddr;
> +
> +    if ( !IS_ALIGNED(address, len) )
> +    {
> +        gdprintk(XENLOG_WARNING,
> +                "Dropping unaligned write to MSI-X table page at %" PRIx64 
> "\n",
> +                address);
> +        return X86EMUL_OKAY;
> +    }
> +
> +    if ( fixmap_idx == ADJACENT_DISCARD_WRITE )
> +        return X86EMUL_OKAY;
> +
> +    hwaddr = fix_to_virt(fixmap_idx) + PAGE_OFFSET(address);
> +
> +    switch ( len ) {

Nit: Style (brace placement).

> @@ -220,16 +364,27 @@ static int cf_check msixtbl_read(
>      unsigned long offset;
>      struct msixtbl_entry *entry;
>      unsigned int nr_entry, index;
> +    unsigned int adjacent_fixmap;
>      int r = X86EMUL_UNHANDLEABLE;
>  
> -    if ( (len != 4 && len != 8) || (address & (len - 1)) )
> +    if ( !IS_ALIGNED(address, len) )
>          return r;
>  
>      rcu_read_lock(&msixtbl_rcu_lock);
> -
> -    entry = msixtbl_find_entry(current, address);
> +    entry = msixtbl_find_entry(current, address, true);
>      if ( !entry )
>          goto out;
> +
> +    adjacent_fixmap = adjacent_handle(entry, address, false);
> +    if ( adjacent_fixmap != ADJACENT_DONT_HANDLE )
> +    {
> +        r = adjacent_read(adjacent_fixmap, address, len, pval);
> +        goto out;
> +    }

adjacent_handle() may return ADJACENT_DONT_HANDLE for reasons other than
the address not being in the expected ranges, so you can't blindly fall
through to the original code just yet (same in functions further down).

> @@ -375,14 +543,23 @@ static bool cf_check msixtbl_range(
>  {
>      struct vcpu *curr = current;
>      unsigned long addr = r->addr;
> -    const struct msi_desc *desc;
> +    struct msixtbl_entry *entry;

const?

> +    const struct msi_desc *desc = NULL;
> +    unsigned int adjacent_fixmap;
> +
>  

Nit: Please don't introduce double blank lines.

> --- a/xen/arch/x86/include/asm/msi.h
> +++ b/xen/arch/x86/include/asm/msi.h
> @@ -207,6 +207,10 @@ struct msg_address {
>                                         PCI_MSIX_ENTRY_SIZE + \
>                                         (~PCI_MSIX_BIRMASK & (PAGE_SIZE - 1)))
>  
> +/* indexes in adj_access_table_idx[] below */
> +#define ADJ_IDX_FIRST 0
> +#define ADJ_IDX_LAST  1
> +
>  struct arch_msix {
>      unsigned int nr_entries, used_entries;
>      struct {
> @@ -214,6 +218,7 @@ struct arch_msix {
>      } table, pba;
>      int table_refcnt[MAX_MSIX_TABLE_PAGES];
>      int table_idx[MAX_MSIX_TABLE_PAGES];
> +    int adj_access_table_idx[2];

adjacent_handle() returns unsigned int - why is it plain int here?

Also, to keep variable name length somewhat limited, I don't think "table"
adds much value to the name here.

> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -961,6 +961,34 @@ static int msix_capability_init(struct pci_dev *dev,
>                  domain_crash(d);
>              /* XXX How to deal with existing mappings? */
>          }
> +
> +        /*
> +         * If the MSI-X table doesn't start at the page boundary, map the 
> first page for
> +         * passthrough accesses.
> +         */
> +        if ( PAGE_OFFSET(table_paddr) )
> +        {
> +            int idx = msix_get_fixmap(msix, table_paddr, table_paddr);
> +
> +            if ( idx > 0 )
> +                msix->adj_access_table_idx[ADJ_IDX_FIRST] = idx;
> +            else
> +                gprintk(XENLOG_ERR, "Failed to map first MSI-X table page: 
> %d\n", idx);
> +        }
> +        /*
> +         * If the MSI-X table doesn't span full page(s), map the last page 
> for
> +         * passthrough accesses.
> +         */
> +        if ( PAGE_OFFSET(table_paddr + msix->nr_entries * 
> PCI_MSIX_ENTRY_SIZE) )
> +        {
> +            uint64_t entry_paddr = table_paddr + msix->nr_entries * 
> PCI_MSIX_ENTRY_SIZE;
> +            int idx = msix_get_fixmap(msix, table_paddr, entry_paddr);
> +
> +            if ( idx > 0 )
> +                msix->adj_access_table_idx[ADJ_IDX_LAST] = idx;
> +            else
> +                gprintk(XENLOG_ERR, "Failed to map last MSI-X table page: 
> %d\n", idx);
> +        }

Wouldn't this better be delayed until / restricted to the device actually
being prepared for guest use? Which may be as simple as making all of this
an "else" to the earlier if()?

Also I think the 2nd comment is somewhat misleading - you don't really mean
"spans full pages" but, along the lines of the first one, "ends on a page
boundary".

Jan



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.