
Re: [Xen-devel] [PATCH v12 1/3] xen/mem_sharing: VM forking



On Wed, Mar 25, 2020 at 9:47 AM Roger Pau Monné <roger.pau@xxxxxxxxxx> wrote:
>
> Sorry it has taken me a while to get to this.
>
> On Mon, Mar 23, 2020 at 10:04:35AM -0700, Tamas K Lengyel wrote:
> > VM forking is the process of creating a domain with an empty memory space and a
> > parent domain specified from which to populate the memory when necessary. For
> > the new domain to be functional the VM state is copied over as part of the fork
> > operation (HVM params, hap allocation, etc).
> >
> > Signed-off-by: Tamas K Lengyel <tamas.lengyel@xxxxxxxxx>
> > ---
> > v12: Minor style adjustments Jan pointed out
> >      Convert mem_sharing_is_fork to inline function
> > v11: Fully copy vcpu_info pages
> >      Setup vcpu_runstate for forks
> >      Added TODO note for PV timers
> >      Copy shared_info page
> >      Add copy_settings function, to be shared with fork_reset in the next patch
> > ---
> >  xen/arch/x86/domain.c             |  11 +
> >  xen/arch/x86/hvm/hvm.c            |   4 +-
> >  xen/arch/x86/mm/hap/hap.c         |   3 +-
> >  xen/arch/x86/mm/mem_sharing.c     | 368 ++++++++++++++++++++++++++++++
> >  xen/arch/x86/mm/p2m.c             |   9 +-
> >  xen/common/domain.c               |   3 +
> >  xen/include/asm-x86/hap.h         |   1 +
> >  xen/include/asm-x86/hvm/hvm.h     |   2 +
> >  xen/include/asm-x86/mem_sharing.h |  18 ++
> >  xen/include/public/memory.h       |   5 +
> >  xen/include/xen/sched.h           |   5 +
> >  11 files changed, 424 insertions(+), 5 deletions(-)
> >
> > diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> > index caf2ecad7e..11d3c2216e 100644
> > --- a/xen/arch/x86/domain.c
> > +++ b/xen/arch/x86/domain.c
> > @@ -2202,6 +2202,17 @@ int domain_relinquish_resources(struct domain *d)
> >              ret = relinquish_shared_pages(d);
> >              if ( ret )
> >                  return ret;
> > +
> > +            /*
> > +             * If the domain is forked, decrement the parent's pause count
> > +             * and release the domain.
> > +             */
> > +            if ( mem_sharing_is_fork(d) )
> > +            {
> > +                domain_unpause(d->parent);
> > +                put_domain(d->parent);
> > +                d->parent = NULL;
> > +            }
> >          }
> >  #endif
> >
> > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> > index a3d115b650..304b3d1562 100644
> > --- a/xen/arch/x86/hvm/hvm.c
> > +++ b/xen/arch/x86/hvm/hvm.c
> > @@ -1917,7 +1917,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
> >      }
> >  #endif
> >
> > -    /* Spurious fault? PoD and log-dirty also take this path. */
> > +    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
> >      if ( p2m_is_ram(p2mt) )
> >      {
> >          rc = 1;
> > @@ -4377,7 +4377,7 @@ static int hvm_allow_get_param(struct domain *d,
> >      return rc;
> >  }
> >
> > -static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
> > +int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
> >  {
> >      int rc;
> >
> > diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> > index a6d5e39b02..814d0c3253 100644
> > --- a/xen/arch/x86/mm/hap/hap.c
> > +++ b/xen/arch/x86/mm/hap/hap.c
> > @@ -321,8 +321,7 @@ static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
> >  }
> >
> >  /* Return the size of the pool, rounded up to the nearest MB */
> > -static unsigned int
> > -hap_get_allocation(struct domain *d)
> > +unsigned int hap_get_allocation(struct domain *d)
> >  {
> >      unsigned int pg = d->arch.paging.hap.total_pages
> >          + d->arch.paging.hap.p2m_pages;
> > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > index 3835bc928f..23deeddff2 100644
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -22,6 +22,7 @@
> >
> >  #include <xen/types.h>
> >  #include <xen/domain_page.h>
> > +#include <xen/event.h>
> >  #include <xen/spinlock.h>
> >  #include <xen/rwlock.h>
> >  #include <xen/mm.h>
> > @@ -36,6 +37,8 @@
> >  #include <asm/altp2m.h>
> >  #include <asm/atomic.h>
> >  #include <asm/event.h>
> > +#include <asm/hap.h>
> > +#include <asm/hvm/hvm.h>
> >  #include <xsm/xsm.h>
> >
> >  #include "mm-locks.h"
> > @@ -1444,6 +1447,334 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
> >      return 0;
> >  }
> >
> > +/*
> > + * Forking a page only gets called when the VM faults due to no entry being
> > + * in the EPT for the access. Depending on the type of access we either
> > + * populate the physmap with a shared entry for read-only access or
> > + * fork the page if its a write access.
> > + *
> > + * The client p2m is already locked so we only need to lock
> > + * the parent's here.
> > + */
> > +int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
> > +{
> > +    int rc = -ENOENT;
> > +    shr_handle_t handle;
> > +    struct domain *parent = d->parent;
> > +    struct p2m_domain *p2m;
> > +    unsigned long gfn_l = gfn_x(gfn);
> > +    mfn_t mfn, new_mfn;
> > +    p2m_type_t p2mt;
> > +    struct page_info *page;
> > +
> > +    if ( !mem_sharing_is_fork(d) )
> > +        return -ENOENT;
> > +
> > +    if ( !unsharing )
> > +    {
> > +        /* For read-only accesses we just add a shared entry to the physmap */
> > +        while ( parent )
> > +        {
> > +            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
> > +                break;
> > +
> > +            parent = parent->parent;
> > +        }
> > +
> > +        if ( !rc )
> > +        {
> > +            /* The client's p2m is already locked */
> > +            struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
>
> Nit: I think you could just use the existing p2m local variable.
>
> > +            p2m_lock(pp2m);
> > +            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
> > +            p2m_unlock(pp2m);
> > +
> > +            if ( !rc )
> > +                return 0;
> > +        }
> > +    }
> > +
> > +    /*
> > +     * If it's a write access (ie. unsharing) or if adding a shared entry to
> > +     * the physmap failed we'll fork the page directly.
> > +     */
> > +    p2m = p2m_get_hostp2m(d);
> > +    parent = d->parent;
> > +
> > +    while ( parent )
> > +    {
> > +        mfn = get_gfn_query(parent, gfn_l, &p2mt);
> > +
> > +        /*
> > +         * We can't fork grant memory from the parent, only regular ram.
> > +         */
>
> Nit: single line comments should use /* ... */ (here and below).
>
> > +        if ( mfn_valid(mfn) && p2m_is_ram(p2mt) )
> > +            break;
> > +
> > +        put_gfn(parent, gfn_l);
> > +        parent = parent->parent;
> > +    }
> > +
> > +    if ( !parent )
> > +        return -ENOENT;
> > +
> > +    if ( !(page = alloc_domheap_page(d, 0)) )
> > +    {
> > +        put_gfn(parent, gfn_l);
> > +        return -ENOMEM;
> > +    }
> > +
> > +    new_mfn = page_to_mfn(page);
> > +    copy_domain_page(new_mfn, mfn);
> > +    set_gpfn_from_mfn(mfn_x(new_mfn), gfn_l);
> > +
> > +    put_gfn(parent, gfn_l);
> > +
> > +    return p2m->set_entry(p2m, gfn, new_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> > +                          p2m->default_access, -1);
> > +}
> > +
> > +static int bring_up_vcpus(struct domain *cd, struct domain *d)
> > +{
> > +    unsigned int i;
> > +    int ret = -EINVAL;
>
> Nit: you can get rid of ret...
>
> > +
> > +    if ( d->max_vcpus != cd->max_vcpus ||
> > +        (ret = cpupool_move_domain(cd, d->cpupool)) )
> > +        return ret;
>
> ...and just return -EINVAL here. Seeing as it's not used anywhere
> else.
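>
> Something like this (just a sketch):
>
> if ( d->max_vcpus != cd->max_vcpus ||
>      cpupool_move_domain(cd, d->cpupool) )
>     return -EINVAL;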
>
> > +
> > +    for ( i = 0; i < cd->max_vcpus; i++ )
> > +    {
> > +        if ( !d->vcpu[i] || cd->vcpu[i] )
> > +            continue;
> > +
> > +        if ( !vcpu_create(cd, i) )
> > +            return -EINVAL;
> > +    }
> > +
> > +    domain_update_node_affinity(cd);
> > +    return 0;
> > +}
> > +
> > +static int copy_vcpu_settings(struct domain *cd, struct domain *d)
> > +{
> > +    unsigned int i;
> > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > +    int ret = -EINVAL;
> > +
> > +    for ( i = 0; i < cd->max_vcpus; i++ )
> > +    {
> > +        const struct vcpu *d_vcpu = d->vcpu[i];
> > +        struct vcpu *cd_vcpu = cd->vcpu[i];
> > +        struct vcpu_runstate_info runstate;
> > +        mfn_t vcpu_info_mfn;
> > +
> > +        if ( !d_vcpu || !cd_vcpu )
> > +            continue;
> > +
> > +        /*
> > +         * Copy & map in the vcpu_info page if the guest uses one
> > +         */
> > +        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> > +        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > +        {
> > +            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> > +
> > +            /*
> > +             * Allocate & map the page for it if it hasn't been already
> > +             */
> > +            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
> > +            {
> > +                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
> > +                unsigned long gfn_l = gfn_x(gfn);
> > +                struct page_info *page;
> > +
> > +                if ( !(page = alloc_domheap_page(cd, 0)) )
> > +                    return -ENOMEM;
> > +
> > +                new_vcpu_info_mfn = page_to_mfn(page);
> > +                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
> > +
> > +                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn, PAGE_ORDER_4K,
> > +                                     p2m_ram_rw, p2m->default_access, -1);
> > +                if ( ret )
> > +                    return ret;
> > +
> > +                ret = map_vcpu_info(cd_vcpu, gfn_l,
> > +                                    d_vcpu->vcpu_info_offset);
> > +                if ( ret )
> > +                    return ret;
> > +            }
> > +
> > +            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
> > +        }
> > +
> > +        /*
> > +         * Setup the vCPU runstate area
> > +         */
> > +        if ( guest_handle_is_null(runstate_guest(cd_vcpu)) )
>
> Maybe I'm confused, but isn't this check the other way around? If the
> parent's runstate is not null, copy it to the fork, ie:
>
> if ( !guest_handle_is_null(runstate_guest(d_vcpu)) )
> {
>     ...
>
> > +        {
> > +            runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
> > +            vcpu_runstate_get(cd_vcpu, &runstate);
> > +            __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1);
>
> You should check the return code I think.
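>
> Combining the two, roughly (just a sketch, untested):
>
> if ( !guest_handle_is_null(runstate_guest(d_vcpu)) )
> {
>     runstate_guest(cd_vcpu) = runstate_guest(d_vcpu);
>     vcpu_runstate_get(cd_vcpu, &runstate);
>     if ( __copy_to_guest(runstate_guest(cd_vcpu), &runstate, 1) )
>         return -EFAULT;
> }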
>
> > +        }
> > +
> > +        /*
> > +         * TODO: to support VMs with PV interfaces copy additional
> > +         * settings here, such as PV timers.
> > +         */
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static int fork_hap_allocation(struct domain *cd, struct domain *d)
> > +{
> > +    int rc;
> > +    bool preempted;
> > +    unsigned long mb = hap_get_allocation(d);
> > +
> > +    if ( mb == hap_get_allocation(cd) )
> > +        return 0;
> > +
> > +    paging_lock(cd);
> > +    rc = hap_set_allocation(cd, mb << (20 - PAGE_SHIFT), &preempted);
> > +    paging_unlock(cd);
> > +
> > +    return preempted ? -ERESTART : rc;
> > +}
> > +
> > +static void copy_tsc(struct domain *cd, struct domain *d)
> > +{
> > +    uint32_t tsc_mode;
> > +    uint32_t gtsc_khz;
> > +    uint32_t incarnation;
> > +    uint64_t elapsed_nsec;
> > +
> > +    tsc_get_info(d, &tsc_mode, &elapsed_nsec, &gtsc_khz, &incarnation);
> > +    /* Don't bump incarnation on set */
> > +    tsc_set_info(cd, tsc_mode, elapsed_nsec, gtsc_khz, incarnation - 1);
> > +}
> > +
> > +static int copy_special_pages(struct domain *cd, struct domain *d)
> > +{
> > +    mfn_t new_mfn, old_mfn;
> > +    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > +    static const unsigned int params[] =
> > +    {
> > +        HVM_PARAM_STORE_PFN,
> > +        HVM_PARAM_IOREQ_PFN,
> > +        HVM_PARAM_BUFIOREQ_PFN,
> > +        HVM_PARAM_CONSOLE_PFN
> > +    };
> > +    unsigned int i;
> > +    int rc;
> > +
> > +    for ( i = 0; i < 4; i++ )
>
> Please use ARRAY_SIZE instead of hard coding 4.
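>
> I.e. something like (assuming the usual ARRAY_SIZE helper from
> xen/lib.h):
>
> for ( i = 0; i < ARRAY_SIZE(params); i++ )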
>
> > diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
> > index 9f51370327..1ed7d13084 100644
> > --- a/xen/arch/x86/mm/p2m.c
> > +++ b/xen/arch/x86/mm/p2m.c
> > @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn_l,
> >
> >      mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> >
> > +    /* Check if we need to fork the page */
> > +    if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> > +         !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> > +        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> > +
> > +    /* Check if we need to unshare the page */
> >      if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
> >      {
> >          ASSERT(p2m_is_hostp2m(p2m));
> > @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
> >              return page;
> >
> >          /* Error path: not a suitable GFN at all */
> > -        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> > +        if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> > +             !mem_sharing_is_fork(p2m->domain) )
> >              return NULL;
> >      }
> >
> > diff --git a/xen/common/domain.c b/xen/common/domain.c
> > index b4eb476a9c..62aed53a16 100644
> > --- a/xen/common/domain.c
> > +++ b/xen/common/domain.c
> > @@ -1270,6 +1270,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
> >
> >      v->vcpu_info = new_info;
> >      v->vcpu_info_mfn = page_to_mfn(page);
> > +#ifdef CONFIG_MEM_SHARING
> > +    v->vcpu_info_offset = offset;
>
> There's no need to introduce this field, you can just use v->vcpu_info
> & ~PAGE_MASK AFAICT.

Just doing what you suggest above results in:

mem_sharing.c:1603:55: error: invalid operands to binary & (have ‘vcpu_info_t * const’ {aka ‘union <anonymous> * const’} and ‘long int’)
                                     d_vcpu->vcpu_info & ~PAGE_MASK);

I can of course cast the vcpu_info pointer, but it's a bit ugly (see
the sketch below). Thoughts?
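
A rough sketch of what the cast variant would look like (untested,
using an unsigned long cast so the & with ~PAGE_MASK compiles):

    ret = map_vcpu_info(cd_vcpu, gfn_l,
                        (unsigned long)d_vcpu->vcpu_info & ~PAGE_MASK);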

Tamas



 

