
Re: [Xen-devel] [PATCHv6 3/3] xen/privcmd: improve performance of MMAPBATCH_V2



On Fri, 6 Mar 2015, David Vrabel wrote:
> Make the IOCTL_PRIVCMD_MMAPBATCH_V2 (and older V1 version) map
> multiple frames at a time rather than one at a time, despite the pages
> being non-consecutive GFNs.
> 
> xen_remap_domain_mfn_array() is added which maps an array of GFNs
> (instead of a consecutive range of GFNs).
> 
> Migrate times are significantly improved (when using a PV toolstack
> domain).  For example, for an idle 12 GiB PV guest:
> 
>         Before     After
>   real  0m38.179s  0m26.868s
>   user  0m15.096s  0m13.652s
>   sys   0m28.988s  0m18.732s
> 
> Signed-off-by: David Vrabel <david.vrabel@xxxxxxxxxx>

Nice!


>  arch/arm/xen/enlighten.c |   20 ++++++--
>  arch/x86/xen/mmu.c       |  102 ++++++++++++++++++++++++++++++++--------
>  drivers/xen/privcmd.c    |  117 ++++++++++++++++++++++++++++++++--------------
>  drivers/xen/xlate_mmu.c  |   50 ++++++++++++--------
>  include/xen/xen-ops.h    |   47 +++++++++++++++++--
>  5 files changed, 253 insertions(+), 83 deletions(-)
> 
> diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
> index 5c04389..9929cb6 100644
> --- a/arch/arm/xen/enlighten.c
> +++ b/arch/arm/xen/enlighten.c
> @@ -53,15 +53,27 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
>  
>  static __read_mostly int xen_events_irq = -1;
>  
> -int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> +int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
>                              unsigned long addr,
> -                            xen_pfn_t mfn, int nr,
> -                            pgprot_t prot, unsigned domid,
> +                            xen_pfn_t *mfn, int nr,
> +                            int *err_ptr, pgprot_t prot,
> +                            unsigned domid,
>                              struct page **pages)
>  {
> -     return xen_xlate_remap_gfn_range(vma, addr, mfn, nr,
> +     return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
>                                        prot, domid, pages);
>  }
> +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
> +
> +/* Not used in ARM. Use xen_remap_domain_mfn_array(). */
> +int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> +                              unsigned long addr,
> +                              xen_pfn_t mfn, int nr,
> +                              pgprot_t prot, unsigned domid,
> +                              struct page **pages)
> +{
> +     return -ENOSYS;
> +}
>  EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);

I understand that it is not used, but since you have already introduced
xen_remap_domain_mfn_array, you might as well implement
xen_remap_domain_mfn_range by calling xen_remap_domain_mfn_array or
xen_xlate_remap_gfn_array.
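
For example, something roughly like this (an untested sketch, just to
illustrate the idea; the temporary-array approach and error handling are
purely illustrative and it assumes linux/slab.h is available for kcalloc):

int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
                               unsigned long addr,
                               xen_pfn_t mfn, int nr,
                               pgprot_t prot, unsigned domid,
                               struct page **pages)
{
        xen_pfn_t *gfns;
        int *errs;
        int i, rc;

        /* Expand the contiguous range into a GFN array so that
         * xen_xlate_remap_gfn_array() can be reused. */
        gfns = kcalloc(nr, sizeof(*gfns), GFP_KERNEL);
        errs = kcalloc(nr, sizeof(*errs), GFP_KERNEL);
        if (!gfns || !errs) {
                rc = -ENOMEM;
                goto out;
        }
        for (i = 0; i < nr; i++)
                gfns[i] = mfn + i;

        rc = xen_xlate_remap_gfn_array(vma, addr, gfns, nr, errs,
                                       prot, domid, pages);
out:
        kfree(errs);
        kfree(gfns);
        return rc;
}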


>  int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index 3d536a5..78d352f 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -2439,7 +2439,8 @@ void __init xen_hvm_init_mmu_ops(void)
>  #define REMAP_BATCH_SIZE 16
>  
>  struct remap_data {
> -     unsigned long mfn;
> +     xen_pfn_t *mfn;
> +     bool contiguous;
>       pgprot_t prot;
>       struct mmu_update *mmu_update;
>  };
> @@ -2448,7 +2449,14 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>                                unsigned long addr, void *data)
>  {
>       struct remap_data *rmd = data;
> -     pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
> +     pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
> +
> +     /* If we have a contiguous range, just update the mfn itself,
> +        else update the pointer to be "next mfn". */
> +     if (rmd->contiguous)
> +             (*rmd->mfn)++;
> +     else
> +             rmd->mfn++;
>  
>       rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
>       rmd->mmu_update->val = pte_val_ma(pte);
> @@ -2457,26 +2465,26 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>       return 0;
>  }
>  
> -int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> -                            unsigned long addr,
> -                            xen_pfn_t mfn, int nr,
> -                            pgprot_t prot, unsigned domid,
> -                            struct page **pages)
> -
> +static int do_remap_mfn(struct vm_area_struct *vma,
> +                     unsigned long addr,
> +                     xen_pfn_t *mfn, int nr,
> +                     int *err_ptr, pgprot_t prot,
> +                     unsigned domid,
> +                     struct page **pages)
>  {
> +     int err = 0;
>       struct remap_data rmd;
>       struct mmu_update mmu_update[REMAP_BATCH_SIZE];
> -     int batch;
>       unsigned long range;
> -     int err = 0;
> +     int mapped = 0;
>  
>       BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
>  
>       if (xen_feature(XENFEAT_auto_translated_physmap)) {
>  #ifdef CONFIG_XEN_PVH
>               /* We need to update the local page tables and the xen HAP */
> -             return xen_xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
> -                                              domid, pages);
> +             return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
> +                                              prot, domid, pages);
>  #else
>               return -EINVAL;
>  #endif
> @@ -2484,9 +2492,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>  
>       rmd.mfn = mfn;
>       rmd.prot = prot;
> +     /* We use the err_ptr to indicate whether we are doing a contiguous
> +      * mapping or a discontiguous mapping. */
> +     rmd.contiguous = !err_ptr;
>  
>       while (nr) {
> -             batch = min(REMAP_BATCH_SIZE, nr);
> +             int index = 0;
> +             int done = 0;
> +             int batch = min(REMAP_BATCH_SIZE, nr);
> +             int batch_left = batch;
>               range = (unsigned long)batch << PAGE_SHIFT;
>  
>               rmd.mmu_update = mmu_update;
> @@ -2495,23 +2509,72 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>               if (err)
>                       goto out;
>  
> -             err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
> -             if (err < 0)
> -                     goto out;
> +             /* We record the error for each page that gives an error, but
> +              * continue mapping until the whole set is done */
> +             do {
> +                     int i;
> +
> +                     err = HYPERVISOR_mmu_update(&mmu_update[index],
> +                                                 batch_left, &done, domid);
> +
> +                     /*
> +                      * @err_ptr may be the same buffer as @mfn, so
> +                      * only clear it after each chunk of @mfn is
> +                      * used.
> +                      */
> +                     if (err_ptr) {
> +                             for (i = index; i < index + done; i++)
> +                                     err_ptr[i] = 0;
> +                     }
> +                     if (err < 0) {
> +                             if (!err_ptr)
> +                                     goto out;
> +                             err_ptr[i] = err;
> +                             done++; /* Skip failed frame. */
> +                     } else
> +                             mapped += done;
> +                     batch_left -= done;
> +                     index += done;
> +             } while (batch_left);
>  
>               nr -= batch;
>               addr += range;
> +             if (err_ptr)
> +                     err_ptr += batch;
>       }
> -
> -     err = 0;
>  out:
>  
>       xen_flush_tlb_all();
>  
> -     return err;
> +     return err < 0 ? err : mapped;
> +}
> +
> +int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> +                            unsigned long addr,
> +                            xen_pfn_t mfn, int nr,
> +                            pgprot_t prot, unsigned domid,
> +                            struct page **pages)
> +{
> +     return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
>  }
>  EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
>  
> +int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
> +                            unsigned long addr,
> +                            xen_pfn_t *mfn, int nr,
> +                            int *err_ptr, pgprot_t prot,
> +                            unsigned domid, struct page **pages)
> +{
> +     /* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
> +      * and it would otherwise be very hard to work out later what the
> +      * actual cause of "wrong memory was mapped in" was.
> +      */
> +     BUG_ON(err_ptr == NULL);
> +     return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
> +}
> +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
> +
> +
>  /* Returns: 0 success */
>  int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
>                              int numpgs, struct page **pages)
> @@ -2526,3 +2589,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
>  #endif
>  }
>  EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
> +
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index 59ac71c..b667c99 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -159,6 +159,40 @@ static int traverse_pages(unsigned nelem, size_t size,
>       return ret;
>  }
>  
> +/*
> + * Similar to traverse_pages, but use each page as a "block" of
> + * data to be processed as one unit.
> + */
> +static int traverse_pages_block(unsigned nelem, size_t size,
> +                             struct list_head *pos,
> +                             int (*fn)(void *data, int nr, void *state),
> +                             void *state)
> +{
> +     void *pagedata;
> +     unsigned pageidx;
> +     int ret = 0;
> +
> +     BUG_ON(size > PAGE_SIZE);

It looks like PAGE_SIZE needs to be a multiple of size. Maybe we can
add a BUG_ON for that too.
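
E.g., next to the existing check (just to illustrate):

        /* entries must tile a page exactly and must not span pages */
        BUG_ON(size > PAGE_SIZE);
        BUG_ON(PAGE_SIZE % size != 0);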


> +     pageidx = PAGE_SIZE;
> +
> +     while (nelem) {
> +             int nr = (PAGE_SIZE/size);
> +             struct page *page;
> +             if (nr > nelem)
> +                     nr = nelem;
> +             pos = pos->next;
> +             page = list_entry(pos, struct page, lru);
> +             pagedata = page_address(page);
> +             ret = (*fn)(pagedata, nr, state);
> +             if (ret)
> +                     break;
> +             nelem -= nr;
> +     }
> +
> +     return ret;
> +}
> +
>  struct mmap_mfn_state {
>       unsigned long va;
>       struct vm_area_struct *vma;
> @@ -274,39 +308,25 @@ struct mmap_batch_state {
>  /* auto translated dom0 note: if domU being created is PV, then mfn is
>   * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP).
>   */
> -static int mmap_batch_fn(void *data, void *state)
> +static int mmap_batch_fn(void *data, int nr, void *state)
>  {
>       xen_pfn_t *mfnp = data;
>       struct mmap_batch_state *st = state;
>       struct vm_area_struct *vma = st->vma;
>       struct page **pages = vma->vm_private_data;
> -     struct page *cur_page = NULL;
> +     struct page **cur_pages = NULL;
>       int ret;
>  
>       if (xen_feature(XENFEAT_auto_translated_physmap))
> -             cur_page = pages[st->index++];
> +             cur_pages = &pages[st->index];
>  
> -     ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
> -                                      st->vma->vm_page_prot, st->domain,
> -                                      &cur_page);
> +     BUG_ON(nr < 0);
> +     ret = xen_remap_domain_mfn_array(st->vma, st->va & PAGE_MASK, mfnp, nr,
> +                                      (int *)mfnp, st->vma->vm_page_prot, 
> +                                      st->domain, cur_pages);
>  
> -     /* Store error code for second pass. */
> -     if (st->version == 1) {
> -             if (ret < 0) {
> -                     /*
> -                      * V1 encodes the error codes in the 32bit top nibble of the
> -                      * mfn (with its known limitations vis-a-vis 64 bit callers).
> -                      */
> -                     *mfnp |= (ret == -ENOENT) ?
> -                                             PRIVCMD_MMAPBATCH_PAGED_ERROR :
> -                                             PRIVCMD_MMAPBATCH_MFN_ERROR;
> -             }
> -     } else { /* st->version == 2 */
> -             *((int *) mfnp) = ret;
> -     }
> -
> -     /* And see if it affects the global_error. */
> -     if (ret < 0) {
> +     /* Adjust the global_error? */
> +     if (ret != nr) {
>               if (ret == -ENOENT)
>                       st->global_error = -ENOENT;
>               else {
> @@ -315,23 +335,35 @@ static int mmap_batch_fn(void *data, void *state)
>                               st->global_error = 1;
>               }
>       }
> -     st->va += PAGE_SIZE;
> +     st->va += PAGE_SIZE * nr;
> +     st->index += nr;
>  
>       return 0;
>  }
>  
> -static int mmap_return_errors(void *data, void *state)
> +static int mmap_return_error(int err, struct mmap_batch_state *st)
>  {
> -     struct mmap_batch_state *st = state;
> +     int ret;
>  
>       if (st->version == 1) {
> -             xen_pfn_t mfnp = *((xen_pfn_t *) data);
> -             if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR)
> -                     return __put_user(mfnp, st->user_mfn++);
> -             else
> +             if (err) {
> +                     xen_pfn_t mfn;
> +
> +                     ret = __get_user(mfn, st->user_mfn);
> +                     if (ret < 0)
> +                             return ret;
> +                     /*
> +                      * V1 encodes the error codes in the 32bit top
> +                      * nibble of the mfn (with its known
> +                      * limitations vis-a-vis 64 bit callers).
> +                      */
> +                     mfn |= (ret == -ENOENT) ?
> +                             PRIVCMD_MMAPBATCH_PAGED_ERROR :
> +                             PRIVCMD_MMAPBATCH_MFN_ERROR;
> +                     return __put_user(mfn, st->user_mfn++);
> +             } else
>                       st->user_mfn++;

Instead of having to resort to __get_user, couldn't you examine err and
base the choice between PRIVCMD_MMAPBATCH_MFN_ERROR and
PRIVCMD_MMAPBATCH_PAGED_ERROR on it, similarly to what we did before?

Also I think you should spend a few more words in the commit message
regarding the changes you are making to the error reporting path.
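
I mean something along these lines (untested sketch; it only shows basing
the flag choice on err -- dropping __get_user altogether would additionally
require keeping the original GFNs, or the already-encoded values, in the
kernel-side pagelist between the two passes):

        if (st->version == 1) {
                if (err) {
                        xen_pfn_t mfn;

                        ret = __get_user(mfn, st->user_mfn);
                        if (ret < 0)
                                return ret;
                        /* Pick the V1 error flag from the per-frame
                         * error, not from the __get_user return value. */
                        mfn |= (err == -ENOENT) ?
                                PRIVCMD_MMAPBATCH_PAGED_ERROR :
                                PRIVCMD_MMAPBATCH_MFN_ERROR;
                        return __put_user(mfn, st->user_mfn++);
                } else
                        st->user_mfn++;
        }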

>       } else { /* st->version == 2 */
> -             int err = *((int *) data);
>               if (err)
>                       return __put_user(err, st->user_err++);
>               else
> @@ -341,6 +373,21 @@ static int mmap_return_errors(void *data, void *state)
>       return 0;
>  }
>  
> +static int mmap_return_errors(void *data, int nr, void *state)
> +{
> +     struct mmap_batch_state *st = state;
> +     int *errs = data;
> +     int i;
> +     int ret;
> +
> +     for (i = 0; i < nr; i++) {
> +             ret = mmap_return_error(errs[i], st);
> +             if (ret < 0)
> +                     return ret;
> +     }
> +     return 0;
> +}
>
>  /* Allocate pfns that are then mapped with gmfns from foreign domid. Update
>   * the vma with the page info to use later.
>   * Returns: 0 if success, otherwise -errno
> @@ -472,8 +519,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>       state.version       = version;
>  
>       /* mmap_batch_fn guarantees ret == 0 */
> -     BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t),
> -                          &pagelist, mmap_batch_fn, &state));
> +     BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
> +                                 &pagelist, mmap_batch_fn, &state));
>  
>       up_write(&mm->mmap_sem);
>  
> @@ -481,8 +528,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>               /* Write back errors in second pass. */
>               state.user_mfn = (xen_pfn_t *)m.arr;
>               state.user_err = m.err;
> -             ret = traverse_pages(m.num, sizeof(xen_pfn_t),
> -                                                      &pagelist, mmap_return_errors, &state);
> +             ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
> +                                        &pagelist, mmap_return_errors, &state);
>       } else
>               ret = 0;
>  
> diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
> index 7724d90..581db0f 100644
> --- a/drivers/xen/xlate_mmu.c
> +++ b/drivers/xen/xlate_mmu.c
> @@ -62,13 +62,15 @@ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
>  }
>  
>  struct remap_data {
> -     xen_pfn_t fgmfn; /* foreign domain's gmfn */
> +     xen_pfn_t *fgmfn; /* foreign domain's gmfn */
>       pgprot_t prot;
>       domid_t  domid;
>       struct vm_area_struct *vma;
>       int index;
>       struct page **pages;
>       struct xen_remap_mfn_info *info;
> +     int *err_ptr;
> +     int mapped;
>  };
>  
>  static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
> @@ -80,38 +82,46 @@ static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
>       pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
>       int rc;
>  
> -     rc = map_foreign_page(pfn, info->fgmfn, info->domid);
> -     if (rc < 0)
> -             return rc;
> -     set_pte_at(info->vma->vm_mm, addr, ptep, pte);
> +     rc = map_foreign_page(pfn, *info->fgmfn, info->domid);
> +     *info->err_ptr++ = rc;
> +     if (!rc) {
> +             set_pte_at(info->vma->vm_mm, addr, ptep, pte);
> +             info->mapped++;
> +     }
> +     info->fgmfn++;
>  
>       return 0;
>  }
>  
> -int xen_xlate_remap_gfn_range(struct vm_area_struct *vma,
> +int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>                             unsigned long addr,
> -                           xen_pfn_t gfn, int nr,
> -                           pgprot_t prot, unsigned domid,
> +                           xen_pfn_t *mfn, int nr,
> +                           int *err_ptr, pgprot_t prot,
> +                           unsigned domid,
>                             struct page **pages)
>  {
>       int err;
>       struct remap_data data;
> -
> -     /* TBD: Batching, current sole caller only does page at a time */
> -     if (nr > 1)
> -             return -EINVAL;
> -
> -     data.fgmfn = gfn;
> -     data.prot = prot;
> +     unsigned long range = nr << PAGE_SHIFT;
> + 
> +     /* Kept here for the purpose of making sure code doesn't break
> +        x86 PVOPS */
> +     BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
> + 
> +     data.fgmfn = mfn;
> +     data.prot  = prot;
>       data.domid = domid;
> -     data.vma = vma;
> -     data.index = 0;
> +     data.vma   = vma;
>       data.pages = pages;
> -     err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
> +     data.index = 0;
> +     data.err_ptr = err_ptr;
> +     data.mapped = 0;
> +
> +     err = apply_to_page_range(vma->vm_mm, addr, range,
>                                 remap_pte_fn, &data);
> -     return err;
> +     return err < 0 ? err : data.mapped;
>  }
> -EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_range);
> +EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
>  
>  int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>                             int nr, struct page **pages)
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index ba7d232..225a2ff 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -27,19 +27,56 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
>  void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order);
>  
>  struct vm_area_struct;
> +
> +/*
> + * xen_remap_domain_mfn_array() - map an array of foreign frames
> + * @vma:     VMA to map the pages into
> + * @addr:    Address at which to map the pages
> + * @gfn:     Array of GFNs to map
> + * @nr:      Number entries in the GFN array
> + * @err_ptr: Returns per-GFN error status.
> + * @prot:    page protection mask
> + * @domid:   Domain owning the pages
> + * @pages:   Array of pages if this domain has an auto-translated physmap
> + *
> + * @gfn and @err_ptr may point to the same buffer, the GFNs will be
> + * overwritten by the error codes after they are mapped.
> + *
> + * Returns the number of successfully mapped frames, or a -ve error
> + * code.
> + */
> +int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
> +                            unsigned long addr,
> +                            xen_pfn_t *gfn, int nr,
> +                            int *err_ptr, pgprot_t prot,
> +                            unsigned domid, 
> +                            struct page **pages);
> +
> +/* xen_remap_domain_mfn_range() - map a range of foreign frames
> + * @vma:     VMA to map the pages into
> + * @addr:    Address at which to map the pages
> + * @gfn:     First GFN to map.
> + * @nr:      Number frames to map
> + * @prot:    page protection mask
> + * @domid:   Domain owning the pages
> + * @pages:   Array of pages if this domain has an auto-translated physmap
> + *
> + * Returns the number of successfully mapped frames, or a -ve error
> + * code.
> + */
>  int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>                              unsigned long addr,
> -                            xen_pfn_t mfn, int nr,
> +                            xen_pfn_t gfn, int nr,
>                              pgprot_t prot, unsigned domid,
>                              struct page **pages);
>  int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
>                              int numpgs, struct page **pages);
>  #ifdef CONFIG_XEN_AUTO_XLATE
> -int xen_xlate_remap_gfn_range(struct vm_area_struct *vma,
> +int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>                             unsigned long addr,
> -                           xen_pfn_t gfn, int nr,
> -                           pgprot_t prot,
> -                           unsigned domid, 
> +                           xen_pfn_t *gfn, int nr,
> +                           int *err_ptr, pgprot_t prot,
> +                           unsigned domid,
>                             struct page **pages);
>  int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>                             int nr, struct page **pages);

Since we are introducing a new internal interface, shouldn't we try to
aim for something more generic, maybe like a scatter-gather list? We
had one consecutive range and now we also have a list of pages. It
makes sense to jump straight to a list of ranges, right?
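
Something shaped roughly like this, for example (names invented purely for
illustration, not a proposal for the final interface):

struct xen_gfn_range {
        xen_pfn_t first_gfn;    /* start of one contiguous run */
        unsigned int nr;        /* number of frames in the run */
};

int xen_remap_domain_gfn_ranges(struct vm_area_struct *vma,
                                unsigned long addr,
                                const struct xen_gfn_range *ranges,
                                unsigned int nr_ranges,
                                int *err_ptr, pgprot_t prot,
                                unsigned domid, struct page **pages);

A single-frame caller would then pass one range with nr == 1, and the
current array case becomes nr_ranges entries of one frame each.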

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel