
Re: [PATCH 3/3] x86/iommu: use a rangeset for hwdom setup



On Fri, Nov 17, 2023 at 10:47:49AM +0100, Roger Pau Monne wrote:
> The current loop that iterates from 0 to the maximum RAM address in order to
> setup the IOMMU mappings is highly inefficient, and it will get worse as the
> amount of RAM increases.  It's also not accounting for any reserved regions
> past the last RAM address.
> 
> Instead of iterating over memory addresses, iterate over the memory map regions
> and use a rangeset in order to keep track of which ranges need to be identity
> mapped in the hardware domain physical address space.
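
For readers unfamiliar with rangesets, here is a minimal sketch of the flow
described above, built on the public rangeset API (rangeset_new(),
rangeset_add_range(), rangeset_remove_range(), rangeset_report_ranges()).
The helper names (hwdom_identity_map, setup_hwdom_mappings) and the
simplified single-range handling are illustrative only, not taken from the
patch:

    static int cf_check identity_map_range(unsigned long s, unsigned long e,
                                           void *data)
    {
        struct domain *d = data;

        /* Hypothetical helper: identity map PFNs s..e (inclusive) for d. */
        return hwdom_identity_map(d, s, e);
    }

    static int setup_hwdom_mappings(struct domain *d, unsigned long ram_start,
                                    unsigned long ram_end,
                                    unsigned long excl_start,
                                    unsigned long excl_end)
    {
        struct rangeset *map = rangeset_new(NULL, "hwdom mappings", 0);
        int rc;

        if ( !map )
            return -ENOMEM;

        /* Accumulate regions from the memory map (here: one RAM range). */
        rc = rangeset_add_range(map, ram_start, ram_end);
        /* Prune anything that must not be identity mapped (e.g. Xen itself). */
        if ( !rc )
            rc = rangeset_remove_range(map, excl_start, excl_end);
        /* One pass over whatever ranges are left creates all the mappings. */
        if ( !rc )
            rc = rangeset_report_ranges(map, 0, ~0UL, identity_map_range, d);

        rangeset_destroy(map);
        return rc;
    }
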
> 
> On an AMD EPYC 7452 with 512GiB of RAM, the time to execute
> arch_iommu_hwdom_init() in nanoseconds is:
> 
> x old
> + new
>     N           Min           Max        Median           Avg        Stddev
> x   5 2.2364154e+10  2.338244e+10 2.2474685e+10 2.2622409e+10 4.2949869e+08
> +   5       1025012       1033036       1026188     1028276.2     3623.1194
> Difference at 95.0% confidence
>       -2.26214e+10 +/- 4.42931e+08
>       -99.9955% +/- 9.05152e-05%
>       (Student's t, pooled s = 3.03701e+08)
> 
> Execution time of arch_iommu_hwdom_init() goes down from ~22s to ~0.001s.
> 
> Note there's a change for HVM domains (ie: PVH dom0) that get switched to
> create the p2m mappings using map_mmio_regions() instead of
> p2m_add_identity_entry(), so that ranges can be mapped with a single function
> call if possible.  Note that the interface of map_mmio_regions() doesn't
> allow creating read-only mappings, but so far there are no such mappings
> created for PVH dom0 in arch_iommu_hwdom_init().
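
To illustrate the difference mentioned in the paragraph above: assuming the
usual signatures of p2m_add_identity_entry() and map_mmio_regions() in
current staging, and with preemption/continuation handling omitted, the
per-page loop versus the single-call variant look roughly like this (sketch
only, not the patch's actual code):

    /* Old flow: one p2m entry per page of the range. */
    static int identity_map_loop(struct domain *d, unsigned long start_pfn,
                                 unsigned long nr)
    {
        unsigned long i;

        for ( i = 0; i < nr; i++ )
        {
            int rc = p2m_add_identity_entry(d, start_pfn + i, p2m_access_rw, 0);

            if ( rc )
                return rc;
        }

        return 0;
    }

    /* New flow: the whole range in one call, identity mapped (gfn == mfn). */
    static int identity_map_single(struct domain *d, unsigned long start_pfn,
                                   unsigned long nr)
    {
        return map_mmio_regions(d, _gfn(start_pfn), nr, _mfn(start_pfn));
    }
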
> 
> No change intended in the resulting mappings that a hardware domain gets.
> 
> Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
> ---
>  xen/arch/x86/hvm/io.c               |  15 +-
>  xen/arch/x86/include/asm/hvm/io.h   |   4 +-
>  xen/drivers/passthrough/x86/iommu.c | 355 +++++++++++++++++-----------
>  3 files changed, 231 insertions(+), 143 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
> index d75af83ad01f..7c4b7317b13a 100644
> --- a/xen/arch/x86/hvm/io.c
> +++ b/xen/arch/x86/hvm/io.c
> @@ -364,9 +364,20 @@ static const struct hvm_mmcfg *vpci_mmcfg_find(const struct domain *d,
>      return NULL;
>  }
>  
> -bool vpci_is_mmcfg_address(const struct domain *d, paddr_t addr)
> +int vpci_subtract_mmcfg(const struct domain *d, struct rangeset *r)
>  {
> -    return vpci_mmcfg_find(d, addr);
> +    const struct hvm_mmcfg *mmcfg;
> +
> +    list_for_each_entry ( mmcfg, &d->arch.hvm.mmcfg_regions, next )
> +    {
> +        int rc = rangeset_remove_range(r, PFN_DOWN(mmcfg->addr),
> +                                       PFN_DOWN(mmcfg->addr + mmcfg->size - 1));
> +
> +        if ( rc )
> +            return rc;
> +    }
> +
> +    return 0;
>  }
>  
>  static unsigned int vpci_mmcfg_decode_addr(const struct hvm_mmcfg *mmcfg,
> diff --git a/xen/arch/x86/include/asm/hvm/io.h b/xen/arch/x86/include/asm/hvm/io.h
> index e5225e75ef26..c9d058fd5695 100644
> --- a/xen/arch/x86/include/asm/hvm/io.h
> +++ b/xen/arch/x86/include/asm/hvm/io.h
> @@ -153,8 +153,8 @@ int register_vpci_mmcfg_handler(struct domain *d, paddr_t addr,
>  /* Destroy tracked MMCFG areas. */
>  void destroy_vpci_mmcfg(struct domain *d);
>  
> -/* Check if an address is between a MMCFG region for a domain. */
> -bool vpci_is_mmcfg_address(const struct domain *d, paddr_t addr);
> +/* Remove MMCFG regions from a given rangeset. */
> +int vpci_subtract_mmcfg(const struct domain *d, struct rangeset *r);
>  
>  #endif /* __ASM_X86_HVM_IO_H__ */
>  
> diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
> index d70cee9fea77..be2c909f61d8 100644
> --- a/xen/drivers/passthrough/x86/iommu.c
> +++ b/xen/drivers/passthrough/x86/iommu.c
> @@ -301,129 +301,133 @@ void iommu_identity_map_teardown(struct domain *d)
>      }
>  }
>  
> -static int __hwdom_init xen_in_range(unsigned long mfn)
> +static int __hwdom_init remove_xen_ranges(struct rangeset *r)
>  {
>      paddr_t start, end;
> -    int i;
> -
> -    enum { region_s3, region_ro, region_rw, region_bss, nr_regions };
> -    static struct {
> -        paddr_t s, e;
> -    } xen_regions[nr_regions] __hwdom_initdata;
> +    int rc;
>  
> -    /* initialize first time */
> -    if ( !xen_regions[0].s )
> -    {
> -        /* S3 resume code (and other real mode trampoline code) */
> -        xen_regions[region_s3].s = bootsym_phys(trampoline_start);
> -        xen_regions[region_s3].e = bootsym_phys(trampoline_end);
> +    /* S3 resume code (and other real mode trampoline code) */
> +    rc = rangeset_remove_range(r, PFN_DOWN(bootsym_phys(trampoline_start)),
> +                               PFN_DOWN(bootsym_phys(trampoline_end)));
> +    if ( rc )
> +        return rc;
>  
> -        /*
> -         * This needs to remain in sync with the uses of the same symbols in
> -         * - __start_xen()
> -         * - is_xen_fixed_mfn()
> -         * - tboot_shutdown()
> -         */
> +    /*
> +     * This needs to remain in sync with the uses of the same symbols in
> +     * - __start_xen()
> +     * - is_xen_fixed_mfn()
> +     * - tboot_shutdown()
> +     */
> +    /* hypervisor .text + .rodata */
> +    rc = rangeset_remove_range(r, PFN_DOWN(__pa(&_stext)),
> +                               PFN_DOWN(__pa(&__2M_rodata_end)));
> +    if ( rc )
> +        return rc;
>  
> -        /* hypervisor .text + .rodata */
> -        xen_regions[region_ro].s = __pa(&_stext);
> -        xen_regions[region_ro].e = __pa(&__2M_rodata_end);
> -        /* hypervisor .data + .bss */
> -        xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
> -        xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
> -        if ( efi_boot_mem_unused(&start, &end) )
> -        {
> -            ASSERT(__pa(start) >= xen_regions[region_rw].s);
> -            ASSERT(__pa(end) <= xen_regions[region_rw].e);
> -            xen_regions[region_rw].e = __pa(start);
> -            xen_regions[region_bss].s = __pa(end);
> -            xen_regions[region_bss].e = __pa(&__2M_rwdata_end);
> -        }
> +    /* hypervisor .data + .bss */
> +    if ( efi_boot_mem_unused(&start, &end) )
> +    {
> +        ASSERT(__pa(start) >= __pa(&__2M_rwdata_start));
> +        rc = rangeset_remove_range(r, PFN_DOWN(__pa(&__2M_rwdata_start)),
> +                                   PFN_DOWN(__pa(start)));
> +        if ( rc )
> +            return rc;
> +        ASSERT(__pa(end) <= __pa(&__2M_rwdata_end));
> +        rc = rangeset_remove_range(r, PFN_DOWN(__pa(end)),
> +                                   PFN_DOWN(__pa(&__2M_rwdata_end)));
> +        if ( rc )
> +            return rc;
> +    }
> +    else
> +    {
> +        rc = rangeset_remove_range(r, PFN_DOWN(__pa(&__2M_rwdata_start)),
> +                                   PFN_DOWN(__pa(&__2M_rwdata_end)));
> +        if ( rc )
> +            return rc;
>      }
> -
> -    start = (paddr_t)mfn << PAGE_SHIFT;
> -    end = start + PAGE_SIZE;
> -    for ( i = 0; i < nr_regions; i++ )
> -        if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
> -            return 1;
>  
>      return 0;
>  }
>  
> -static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d,
> -                                                 unsigned long pfn,
> -                                                 unsigned long max_pfn)
> +static int __hwdom_init map_subtract(unsigned long s, unsigned long e,

Bah, this (and others below) is missing the cf_check attribute.

Will fix in v2.

Roger.
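
For reference, the fix mentioned above is presumably just a matter of adding
the attribute to the callbacks handed to rangeset_report_ranges(), so that
they get an ENDBR instruction when Xen is built with CET-IBT. A sketch (the
body here is illustrative, not copied from the patch):

    static int __hwdom_init cf_check map_subtract(unsigned long s,
                                                  unsigned long e, void *data)
    {
        struct rangeset *map = data;

        /* Illustrative body: drop [s, e] from the rangeset being built. */
        return rangeset_remove_range(map, s, e);
    }
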



 

