 
	
| [Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [PATCH v3 06/11] vpci/header: Handle p2m range sets per BAR
 Hi, Roger!
On 26.10.21 12:08, Roger Pau Monné wrote:
> On Thu, Sep 30, 2021 at 10:52:18AM +0300, Oleksandr Andrushchenko wrote:
>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@xxxxxxxx>
>>
>> Instead of handling a single range set, that contains all the memory
>> regions of all the BARs and ROM, have them per BAR.
>>
>> This is in preparation of making non-identity mappings in p2m for the
>> MMIOs/ROM.
>>
>> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@xxxxxxxx>
>> ---
>>   xen/drivers/vpci/header.c | 172 ++++++++++++++++++++++++++------------
>>   xen/include/xen/vpci.h    |   3 +-
>>   2 files changed, 122 insertions(+), 53 deletions(-)
>>
>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>> index ec4d215f36ff..9c603d26d302 100644
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -131,49 +131,75 @@ static void modify_decoding(const struct pci_dev 
>> *pdev, uint16_t cmd,
>>   
>>   bool vpci_process_pending(struct vcpu *v)
>>   {
>> -    if ( v->vpci.mem )
>> +    if ( v->vpci.num_mem_ranges )
>>       {
>>           struct map_data data = {
>>               .d = v->domain,
>>               .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
>>           };
>> -        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
>> +        struct pci_dev *pdev = v->vpci.pdev;
>> +        struct vpci_header *header = &pdev->vpci->header;
>> +        unsigned int i;
>>   
>> -        if ( rc == -ERESTART )
>> -            return true;
>> +        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +        {
>> +            struct vpci_bar *bar = &header->bars[i];
>> +            int rc;
>>   
>> -        spin_lock(&v->vpci.pdev->vpci->lock);
>> -        /* Disable memory decoding unconditionally on failure. */
>> -        modify_decoding(v->vpci.pdev,
>> -                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : 
>> v->vpci.cmd,
>> -                        !rc && v->vpci.rom_only);
>> -        spin_unlock(&v->vpci.pdev->vpci->lock);
>> +            if ( !bar->mem )
>> +                continue;
>>   
>> -        rangeset_destroy(v->vpci.mem);
>> -        v->vpci.mem = NULL;
>> -        if ( rc )
>> -            /*
>> -             * FIXME: in case of failure remove the device from the domain.
>> -             * Note that there might still be leftover mappings. While this 
>> is
>> -             * safe for Dom0, for DomUs the domain will likely need to be
>> -             * killed in order to avoid leaking stale p2m mappings on
>> -             * failure.
>> -             */
>> -            vpci_remove_device(v->vpci.pdev);
>> +            rc = rangeset_consume_ranges(bar->mem, map_range, &data);
>> +
>> +            if ( rc == -ERESTART )
>> +                return true;
>> +
>> +            spin_lock(&pdev->vpci->lock);
>> +            /* Disable memory decoding unconditionally on failure. */
>> +            modify_decoding(pdev,
>> +                            rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : 
>> v->vpci.cmd,
>> +                            !rc && v->vpci.rom_only);
>> +            spin_unlock(&pdev->vpci->lock);
>> +
>> +            rangeset_destroy(bar->mem);
> Now that the rangesets are per-BAR we might have to consider
> allocating them at initialization time and not destroying them when
> empty. We could replace the NULL checks with rangeset_is_empty
> instead. Not that you have to do this on this patch, but I think it's
> worth mentioning.
Yes, this is a good idea. I will re-work the patch to create/destroy
the rangesets once in add/remove
>
>> +            bar->mem = NULL;
>> +            v->vpci.num_mem_ranges--;
>> +            if ( rc )
>> +                /*
>> +                 * FIXME: in case of failure remove the device from the 
>> domain.
>> +                 * Note that there might still be leftover mappings. While 
>> this is
>> +                 * safe for Dom0, for DomUs the domain will likely need to 
>> be
>> +                 * killed in order to avoid leaking stale p2m mappings on
>> +                 * failure.
>> +                 */
>> +                vpci_remove_device(pdev);
>> +        }
>>       }
>>   
>>       return false;
>>   }
>>   
>>   static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>> -                            struct rangeset *mem, uint16_t cmd)
>> +                            uint16_t cmd)
>>   {
>>       struct map_data data = { .d = d, .map = true };
>> -    int rc;
>> +    struct vpci_header *header = &pdev->vpci->header;
>> +    int rc = 0;
>> +    unsigned int i;
>> +
>> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +    {
>> +        struct vpci_bar *bar = &header->bars[i];
>>   
>> -    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == 
>> -ERESTART )
>> -        process_pending_softirqs();
>> -    rangeset_destroy(mem);
>> +        if ( !bar->mem )
>> +            continue;
>> +
>> +        while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
>> +                                              &data)) == -ERESTART )
>> +            process_pending_softirqs();
>> +        rangeset_destroy(bar->mem);
>> +        bar->mem = NULL;
>> +    }
>>       if ( !rc )
>>           modify_decoding(pdev, cmd, false);
>>   
>> @@ -181,7 +207,7 @@ static int __init apply_map(struct domain *d, const 
>> struct pci_dev *pdev,
>>   }
>>   
>>   static void defer_map(struct domain *d, struct pci_dev *pdev,
>> -                      struct rangeset *mem, uint16_t cmd, bool rom_only)
>> +                      uint16_t cmd, bool rom_only, uint8_t num_mem_ranges)
> Like mentioned below, I don't think you need to pass the number of
> BARs that need mapping changes. Iff that's strictly needed, it should
> be an unsigned int.
bool map_pending :1 works great
>
>>   {
>>       struct vcpu *curr = current;
>>   
>> @@ -192,9 +218,9 @@ static void defer_map(struct domain *d, struct pci_dev 
>> *pdev,
>>        * started for the same device if the domain is not well-behaved.
>>        */
>>       curr->vpci.pdev = pdev;
>> -    curr->vpci.mem = mem;
>>       curr->vpci.cmd = cmd;
>>       curr->vpci.rom_only = rom_only;
>> +    curr->vpci.num_mem_ranges = num_mem_ranges;
>>       /*
>>        * Raise a scheduler softirq in order to prevent the guest from 
>> resuming
>>        * execution with pending mapping operations, to trigger the invocation
>> @@ -206,42 +232,47 @@ static void defer_map(struct domain *d, struct pci_dev 
>> *pdev,
>>   static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool 
>> rom_only)
>>   {
>>       struct vpci_header *header = &pdev->vpci->header;
>> -    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>>       struct pci_dev *tmp, *dev = NULL;
>>       const struct vpci_msix *msix = pdev->vpci->msix;
>> -    unsigned int i;
>> +    unsigned int i, j;
>>       int rc;
>> -
>> -    if ( !mem )
>> -        return -ENOMEM;
>> +    uint8_t num_mem_ranges;
>>   
>>       /*
>> -     * Create a rangeset that represents the current device BARs memory 
>> region
>> +     * Create a rangeset per BAR that represents the current device memory 
>> region
>>        * and compare it against all the currently active BAR memory regions. 
>> If
>>        * an overlap is found, subtract it from the region to be 
>> mapped/unmapped.
>>        *
>> -     * First fill the rangeset with all the BARs of this device or with the 
>> ROM
>> +     * First fill the rangesets with all the BARs of this device or with 
>> the ROM
>>        * BAR only, depending on whether the guest is toggling the memory 
>> decode
>>        * bit of the command register, or the enable bit of the ROM BAR 
>> register.
>>        */
>>       for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>>       {
>> -        const struct vpci_bar *bar = &header->bars[i];
>> +        struct vpci_bar *bar = &header->bars[i];
>>           unsigned long start = PFN_DOWN(bar->addr);
>>           unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>>   
>> +        bar->mem = NULL;
> Why do you need to set mem to NULL here? I think we should instead
> assert that bar->mem == NULL here.
I will put an ASSERT here
>
>> +
>>           if ( !MAPPABLE_BAR(bar) ||
>>                (rom_only ? bar->type != VPCI_BAR_ROM
>>                          : (bar->type == VPCI_BAR_ROM && 
>> !header->rom_enabled)) )
>>               continue;
>>   
>> -        rc = rangeset_add_range(mem, start, end);
>> +        bar->mem = rangeset_new(NULL, NULL, 0);
>> +        if ( !bar->mem )
>> +        {
>> +            rc = -ENOMEM;
>> +            goto fail;
>> +        }
>> +
>> +        rc = rangeset_add_range(bar->mem, start, end);
>>           if ( rc )
>>           {
>>               printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
>>                      start, end, rc);
>> -            rangeset_destroy(mem);
>> -            return rc;
>> +            goto fail;
>>           }
>>       }
>>   
>> @@ -252,14 +283,21 @@ static int modify_bars(const struct pci_dev *pdev, 
>> uint16_t cmd, bool rom_only)
>>           unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>>                                        vmsix_table_size(pdev->vpci, i) - 1);
>>   
>> -        rc = rangeset_remove_range(mem, start, end);
>> -        if ( rc )
>> +        for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
>>           {
>> -            printk(XENLOG_G_WARNING
>> -                   "Failed to remove MSIX table [%lx, %lx]: %d\n",
>> -                   start, end, rc);
>> -            rangeset_destroy(mem);
>> -            return rc;
>> +            const struct vpci_bar *bar = &header->bars[j];
>> +
>> +            if ( !bar->mem )
>> +                continue;
>> +
>> +            rc = rangeset_remove_range(bar->mem, start, end);
>> +            if ( rc )
>> +            {
>> +                printk(XENLOG_G_WARNING
>> +                       "Failed to remove MSIX table [%lx, %lx]: %d\n",
>> +                       start, end, rc);
>> +                goto fail;
>> +            }
>>           }
>>       }
>>   
>> @@ -291,7 +329,8 @@ static int modify_bars(const struct pci_dev *pdev, 
>> uint16_t cmd, bool rom_only)
>>               unsigned long start = PFN_DOWN(bar->addr);
>>               unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>>   
>> -            if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) 
>> ||
>> +            if ( !bar->enabled ||
>> +                 !rangeset_overlaps_range(bar->mem, start, end) ||
>>                    /*
>>                     * If only the ROM enable bit is toggled check against 
>> other
>>                     * BARs in the same device for overlaps, but not against 
>> the
>> @@ -300,13 +339,12 @@ static int modify_bars(const struct pci_dev *pdev, 
>> uint16_t cmd, bool rom_only)
>>                    (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
>>                   continue;
>>   
>> -            rc = rangeset_remove_range(mem, start, end);
>> +            rc = rangeset_remove_range(bar->mem, start, end);
>>               if ( rc )
>>               {
>>                   printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: 
>> %d\n",
>>                          start, end, rc);
>> -                rangeset_destroy(mem);
>> -                return rc;
>> +                goto fail;
>>               }
>>           }
>>       }
>> @@ -324,12 +362,42 @@ static int modify_bars(const struct pci_dev *pdev, 
>> uint16_t cmd, bool rom_only)
>>            * will always be to establish mappings and process all the BARs.
>>            */
>>           ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
>> -        return apply_map(pdev->domain, pdev, mem, cmd);
>> +        return apply_map(pdev->domain, pdev, cmd);
>>       }
>>   
>> -    defer_map(dev->domain, dev, mem, cmd, rom_only);
>> +    /* Find out how many memory ranges has left after MSI and overlaps. */
>> +    num_mem_ranges = 0;
>> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +    {
>> +        struct vpci_bar *bar = &header->bars[i];
> There's no need to declare this local variable AFAICT, just use
> header->bars[i].mem.
Ok
>   In any case this is likely to go away if you
> follow my recommendation below to just call defer_map unconditionally
> like it's currently done.
Please see below
>> +
>> +        if ( !rangeset_is_empty(bar->mem) )
>> +            num_mem_ranges++;
>> +    }
>> +
>> +    /*
>> +     * There are cases when PCI device, root port for example, has neither
>> +     * memory space nor IO. In this case PCI command register write is
>> +     * missed resulting in the underlying PCI device not functional, so:
>> +     *   - if there are no regions write the command register now
>> +     *   - if there are regions then defer work and write later on
>> +     */
>> +    if ( !num_mem_ranges )
>> +        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
> I think this is wrong, as not calling defer_map will prevent the
> rangesets (bar[i]->mem) from being destroyed, so we are effectively
> leaking memory.
Not really. As in case of num_mem_ranges == 0 there are no rangesets
to free as none was allocated
>
> You need to take a path similar to the failure one in case there are
> no mappings pending, or even better just call defer_map anyway and let
> it do it's thing, it should be capable of handling empty rangesets
> just fine. That's how it's currently done.
So, I think this is still valid to break early and do not go with defer_map
>
>> +    else
>> +        defer_map(dev->domain, dev, cmd, rom_only, num_mem_ranges);
>>   
>>       return 0;
>> +
>> +fail:
> We usually ask labels to be indented with one space.
Sure. I am confused a bit: there is no word for that in the coding
style and the sources use labels with and without the space.
>
>> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> +    {
>> +        struct vpci_bar *bar = &header->bars[i];
>> +
>> +        rangeset_destroy(bar->mem);
>> +        bar->mem = NULL;
>> +    }
>> +    return rc;
>>   }
>>   
>>   static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
>> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
>> index a0320b22cb36..352e02d0106d 100644
>> --- a/xen/include/xen/vpci.h
>> +++ b/xen/include/xen/vpci.h
>> @@ -80,6 +80,7 @@ struct vpci {
>>               /* Guest view of the BAR. */
>>               uint64_t guest_addr;
>>               uint64_t size;
>> +            struct rangeset *mem;
>>               enum {
>>                   VPCI_BAR_EMPTY,
>>                   VPCI_BAR_IO,
>> @@ -154,9 +155,9 @@ struct vpci {
>>   
>>   struct vpci_vcpu {
>>       /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
>> -    struct rangeset *mem;
>>       struct pci_dev *pdev;
>>       uint16_t cmd;
>> +    uint8_t num_mem_ranges;
> AFAICT This could be a simple bool:
>
> bool map_pending : 1;
>
> As there's no strict need to know how many BARs have pending mappings.
This is true
>
> Thanks, Roger.
Thank you,
Oleksandr
 | 
|  | Lists.xenproject.org is hosted with RackSpace, monitoring our |