
Re: [RFC PATCH v3 1/3] xen/memory : Add a stats_table resource type


  • To: Matias Ezequiel Vara Larsen <matiasevara@xxxxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Mon, 22 Apr 2024 18:07:22 +0200
  • Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, George Dunlap <george.dunlap@xxxxxxxxxx>, Ian Jackson <iwj@xxxxxxxxxxxxxx>, Julien Grall <julien@xxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, Dario Faggioli <dfaggioli@xxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx
  • Delivery-date: Mon, 22 Apr 2024 16:07:36 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 31.10.2023 17:21, Matias Ezequiel Vara Larsen wrote:
> This commit proposes a new mechanism to query the RUNSTATE_running counter for
> a given vcpu from a dom0 userspace application. This commit proposes to expose
> that counter by using the acquire_resource interface. For this purpose, the
> commit adds a new resource named XENMEM_resource_stats_table and a
> type-specific resource named XENMEM_resource_stats_table_id_vcpustats. This
> type-specific resource is aiming at exposing per-VCPU counters.
> 
> The current mechanism relies on the XEN_DOMCTL_getvcpuinfo and holds a single
> global domctl_lock for the entire hypercall; and iterate over every vcpu in
> the system for every update thus impacting operations that share that lock. This
> commit proposes to expose vcpu RUNSTATE_running via the xenforeignmemory
> interface thus preventing to issue the hypercall and holding the lock.

Throughout, can you please avoid "this commit", "proposes", and the like? You
want to plainly describe what you do and why.

> --- a/xen/common/memory.c
> +++ b/xen/common/memory.c
> @@ -1092,6 +1092,27 @@ unsigned int ioreq_server_max_frames(const struct domain *d)
>      return nr;
>  }
>  
> +unsigned int stats_table_max_frames(const struct domain *d, unsigned int id)

Any particular reason this isn't next to its sibling acquire_stats_table()?

> +{
> +    unsigned int nr = 0;
> +    unsigned int size;
> +
> +    switch ( id )
> +    {
> +    case XENMEM_resource_stats_table_id_vcpustats:
> +        size = DIV_ROUND_UP(sizeof(struct vcpu_shmem_stats), SMP_CACHE_BYTES) +
> +               DIV_ROUND_UP(sizeof(struct vcpu_stats), SMP_CACHE_BYTES) * d->max_vcpus;
> +
> +        nr = DIV_ROUND_UP(size, PAGE_SIZE);
> +        break;
> +
> +    default:
> +        break;
> +    }
> +
> +    return nr;
> +}
> +
>  /*
>   * Return 0 on any kind of error.  Caller converts to -EINVAL.
>   *
> @@ -1113,6 +1134,9 @@ static unsigned int resource_max_frames(const struct domain *d,
>      case XENMEM_resource_vmtrace_buf:
>          return d->vmtrace_size >> PAGE_SHIFT;
>  
> +    case XENMEM_resource_stats_table:
> +        return stats_table_max_frames(d, id);
> +
>      default:
>          return -EOPNOTSUPP;
>      }
> @@ -1176,6 +1200,146 @@ static int acquire_vmtrace_buf(
>      return nr_frames;
>  }
>  
> +void stats_free_vcpu_mfn(struct domain * d)
> +{
> +    struct page_info *pg = d->vcpustats_page.pg;
> +    void * _va = d->vcpustats_page.va;

What use is the leading underscore here? Also (nit) stray blank after *.

> +    unsigned int i;
> +    unsigned int size;
> +    unsigned int nr_frames;
> +
> +    if ( !pg )
> +        return;
> +
> +    d->vcpustats_page.pg = NULL;
> +    d->vcpustats_page.va = NULL;
> +
> +    size = DIV_ROUND_UP(sizeof(struct vcpu_shmem_stats), SMP_CACHE_BYTES) +
> +           DIV_ROUND_UP(sizeof(struct vcpu_stats), SMP_CACHE_BYTES)
> +           * d->max_vcpus;
> +
> +    nr_frames = DIV_ROUND_UP(size, PAGE_SIZE);
> +
> +    vunmap(_va);
> +
> +    for ( i = 0; i < nr_frames; i++ )
> +    {
> +        put_page_alloc_ref(&pg[i]);
> +        put_page_and_type(&pg[i]);
> +    }
> +}
> +
> +static int stats_vcpu_alloc_mfn(struct domain *d)

Both functions' names suggest a single MFN is being allocated / freed.

> +{
> +    struct page_info *pg;
> +    int order;

This can't be negative, can it?

> +    unsigned int i;
> +    unsigned int size;
> +    unsigned int nr_frames;
> +    void *_va;
> +    mfn_t mfn;
> +    struct vcpu_shmem_stats *hd;
> +
> +    size = DIV_ROUND_UP(sizeof(struct vcpu_shmem_stats), SMP_CACHE_BYTES) +

Nit: Style is correct here, ...

> +           DIV_ROUND_UP(sizeof(struct vcpu_stats), SMP_CACHE_BYTES)
> +           * d->max_vcpus;

... but then not here (the * belongs on the earlier line). Also this is the 3rd
instance of the same expression, which likely wants putting into a helper
function.
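
Something along these lines, perhaps (just a sketch, with a made-up name):

    static unsigned int stats_vcpu_area_size(const struct domain *d)
    {
        /* Mirrors the expression as used in the three places in this patch. */
        return DIV_ROUND_UP(sizeof(struct vcpu_shmem_stats), SMP_CACHE_BYTES) +
               DIV_ROUND_UP(sizeof(struct vcpu_stats), SMP_CACHE_BYTES) *
               d->max_vcpus;
    }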

> +    nr_frames = DIV_ROUND_UP(size, PAGE_SIZE);
> +
> +    order = get_order_from_bytes(size);
> +    pg = alloc_domheap_pages(d, order, MEMF_no_refcount);
> +
> +    if ( !pg )
> +        return -ENOMEM;
> +
> +    for ( i = 0; i < nr_frames; i++ )
> +    {
> +        if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) )
> +            /*
> +             * The domain can't possibly know about this page yet, so failure
> +             * here is a clear indication of something fishy going on.
> +             */
> +            goto refcnt_err;
> +    }
> +
> +    mfn = page_to_mfn(pg);
> +    _va = vmap(&mfn, nr_frames);

This won't work when nr_frames > 1. The first parameter of this function
points to an array of nr_frames elements. I think you mean vmap_contig(),
yet even better would be if you didn't needlessly allocate non-order-0
pages (and perhaps excess space) here.
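
For illustration only (not a drop-in fix): keeping the higher-order allocation,
the minimal change would be

    _va = vmap_contig(mfn, nr_frames);

whereas with order-0 allocations an array of MFNs would need building first,
along the lines of

    mfn_t *mfns = xmalloc_array(mfn_t, nr_frames);

    if ( !mfns )
        return -ENOMEM;

    for ( i = 0; i < nr_frames; i++ )
    {
        struct page_info *pg = alloc_domheap_page(d, MEMF_no_refcount);

        if ( !pg )
            return -ENOMEM; /* plus unwinding, omitted in this sketch */
        mfns[i] = page_to_mfn(pg);
    }

    _va = vmap(mfns, nr_frames);
    xfree(mfns);

(per-page refcounting and error unwinding left out above for brevity).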

> +    if ( !_va )
> +        goto refcnt_err;
> +
> +    for ( i = 0; i < nr_frames; i++ )
> +        clear_page(_va + i * PAGE_SIZE);
> +
> +    /* Initialize vcpu_shmem_stats header */
> +    hd = (struct vcpu_shmem_stats*)_va;

Please avoid casts when they're not needed.

> +    hd->magic = VCPU_STATS_MAGIC;
> +    hd->offset = ROUNDUP(sizeof(struct vcpu_shmem_stats), SMP_CACHE_BYTES);
> +    hd->size = sizeof(struct vcpu_stats);
> +    hd->stride = ROUNDUP(sizeof(struct vcpu_stats), SMP_CACHE_BYTES);

Don't you need an smp_wmb() here, to make sure ...

> +    d->vcpustats_page.va  = _va;
> +    d->vcpustats_page.pg = pg;

... the global announcement of these pointers becomes visible only with the
page properly seeded?
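
I.e. something like (sketch only):

    hd->stride = ROUNDUP(sizeof(struct vcpu_stats), SMP_CACHE_BYTES);

    smp_wmb(); /* publish the initialised header before the pointers */

    d->vcpustats_page.va = _va;
    d->vcpustats_page.pg = pg;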

> +    return 0;
> +
> + refcnt_err:
> +    /*
> +     * We can theoretically reach this point if someone has taken 2^43 refs on
> +     * the frames in the time the above loop takes to execute, or someone has
> +     * made a blind decrease reservation hypercall and managed to pick the
> +     * right mfn.  Free the memory we safely can, and leak the rest.
> +     */
> +    while ( i-- )
> +    {
> +        put_page_alloc_ref(&pg[i]);
> +        put_page_and_type(&pg[i]);
> +    }
> +
> +    return -ENODATA;
> +}
> +
> +static int acquire_stats_table(struct domain *d,
> +                               unsigned int id,
> +                               unsigned int frame,
> +                               unsigned int nr_frames,
> +                               xen_pfn_t mfn_list[])
> +{
> +    mfn_t mfn;
> +    int rc;
> +    unsigned int i;
> +    unsigned int max_frames;
> +
> +    if ( !d )
> +        return -ENOENT;
> +
> +    switch ( id )
> +    {
> +    case XENMEM_resource_stats_table_id_vcpustats:
> +        max_frames = DIV_ROUND_UP(d->max_vcpus * sizeof(struct vcpu_stats),
> +                                  PAGE_SIZE);

What about the sizeof(struct vcpu_shmem_stats) part? And what about the
SMP_CACHE_BYTES alignment enforced elsewhere?
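
One way to keep this in sync would be to simply reuse the function added
earlier in this patch (or a common helper as suggested above), e.g.

        max_frames = stats_table_max_frames(d, id);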

> +        if ( (frame + nr_frames) > max_frames )
> +            return -EINVAL;
> +
> +        if ( !d->vcpustats_page.pg )
> +        {
> +            rc = stats_vcpu_alloc_mfn(d);
> +            if ( rc )
> +                return rc;
> +        }

I might guess you have taken acquire_vmtrace_buf() for reference, but
that looks to suffer from the same issue as this code: What if two racing
requests appear? Both will allocate new memory, and one of the two
blocks will be leaked. With the "right" timing you may even end up with
inconsistent .va and .pg fields.
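
One way to close that window would be to serialise the check-and-allocate,
e.g. (sketch only, assuming a dedicated - and so far hypothetical -
d->vcpustats_lock set up at domain creation):

        spin_lock(&d->vcpustats_lock);
        rc = d->vcpustats_page.pg ? 0 : stats_vcpu_alloc_mfn(d);
        spin_unlock(&d->vcpustats_lock);
        if ( rc )
            return rc;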

> --- a/xen/common/sched/core.c
> +++ b/xen/common/sched/core.c
> @@ -264,6 +264,11 @@ static inline void vcpu_runstate_change(
>  {
>      s_time_t delta;
>      struct sched_unit *unit = v->sched_unit;
> +    struct vcpu_shmem_stats *vcpu_shmem;
> +    struct vcpu_stats *vcpu_info;
> +    void *_va;
> +    struct domain *d = v->domain;
> +    int offset;

Once again - this can't be negative, can it? (I question the need for the
variable in the first place, though.)

> @@ -287,6 +292,21 @@ static inline void vcpu_runstate_change(
>      }
>  
>      v->runstate.state = new_state;
> +
> +    _va = d->vcpustats_page.va;
> +
> +    if ( !_va )
> +        return;
> +
> +    vcpu_shmem = (struct vcpu_shmem_stats*)_va;
> +    vcpu_info = (struct vcpu_stats*)((void*)vcpu_shmem + vcpu_shmem->offset);

You just cast _va from void *. And here you cast it back right away. As before,
please limit casts to ones actually necessary (and sensible).
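
For illustration, the same logic without the intermediate casts (and without
the _va local) might simply be

    vcpu_shmem = d->vcpustats_page.va;
    if ( !vcpu_shmem )
        return;

    vcpu_info = d->vcpustats_page.va + vcpu_shmem->offset;

(relying on void pointer arithmetic; a sketch only).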

> --- a/xen/include/public/vcpu.h
> +++ b/xen/include/public/vcpu.h
> @@ -235,6 +235,33 @@ struct vcpu_register_time_memory_area {
>  typedef struct vcpu_register_time_memory_area vcpu_register_time_memory_area_t;
>  DEFINE_XEN_GUEST_HANDLE(vcpu_register_time_memory_area_t);
>  
> +struct vcpu_shmem_stats {
> +#define VCPU_STATS_MAGIC 0xaabbccdd
> +    uint32_t magic;
> +    uint32_t offset;
> +    uint32_t size;
> +    uint32_t stride;
> +};
> +
> +struct vcpu_stats {
> +    /*
> +     * If the least-significant bit of the seq number is set then an update
> +     * is in progress and the consumer must wait to read a consistent set of
> +     * values. This mechanism is similar to Linux's seqlock.
> +     */
> +    uint32_t seq;
> +    uint32_t pad0;
> +    /*
> +     * If the most-significant bit of a counter is set then the counter
> +     * is inactive and the consumer must ignore its value. Note that this
> +     * could also indicate that the counter has overflowed.
> +     */
> +     uint64_t runstate_running_time;

Nit: Indentation.

> +};
> +
> +typedef struct vcpu_shmem_stats xen_vcpu_shmemstats_t;
> +typedef struct vcpu_stats xen_shared_vcpustats_t;

When you properly use xen_ prefixes for the typedef names, why not also for
the structure tags and (then XEN_) for the magic number?

Also - no blank line above here please.
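
For context, a consumer mapping this area via xenforeignmemory would be
expected to read a counter along these lines (illustrative only, following
the seqlock-like scheme described in the comments above):

    static uint64_t read_running_time(const volatile struct vcpu_stats *s)
    {
        uint32_t seq;
        uint64_t val;

        do {
            do {
                seq = s->seq;
            } while ( seq & 1 );   /* odd: writer update in progress */
            /* a read barrier is needed here on a real consumer ... */
            val = s->runstate_running_time;
            /* ... and another one here, before re-checking seq */
        } while ( s->seq != seq );

        return val;                /* top bit set => value to be ignored */
    }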

Jan
