[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v4 03/10] xen/page_alloc: Implement NUMA-node-specific claims


  • To: Bernhard Kaindl <bernhard.kaindl@xxxxxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Thu, 5 Mar 2026 11:53:30 +0100
  • Autocrypt: addr=jbeulich@xxxxxxxx; keydata= xsDiBFk3nEQRBADAEaSw6zC/EJkiwGPXbWtPxl2xCdSoeepS07jW8UgcHNurfHvUzogEq5xk hu507c3BarVjyWCJOylMNR98Yd8VqD9UfmX0Hb8/BrA+Hl6/DB/eqGptrf4BSRwcZQM32aZK 7Pj2XbGWIUrZrd70x1eAP9QE3P79Y2oLrsCgbZJfEwCgvz9JjGmQqQkRiTVzlZVCJYcyGGsD /0tbFCzD2h20ahe8rC1gbb3K3qk+LpBtvjBu1RY9drYk0NymiGbJWZgab6t1jM7sk2vuf0Py O9Hf9XBmK0uE9IgMaiCpc32XV9oASz6UJebwkX+zF2jG5I1BfnO9g7KlotcA/v5ClMjgo6Gl MDY4HxoSRu3i1cqqSDtVlt+AOVBJBACrZcnHAUSuCXBPy0jOlBhxPqRWv6ND4c9PH1xjQ3NP nxJuMBS8rnNg22uyfAgmBKNLpLgAGVRMZGaGoJObGf72s6TeIqKJo/LtggAS9qAUiuKVnygo 3wjfkS9A3DRO+SpU7JqWdsveeIQyeyEJ/8PTowmSQLakF+3fote9ybzd880fSmFuIEJldWxp Y2ggPGpiZXVsaWNoQHN1c2UuY29tPsJgBBMRAgAgBQJZN5xEAhsDBgsJCAcDAgQVAggDBBYC AwECHgECF4AACgkQoDSui/t3IH4J+wCfQ5jHdEjCRHj23O/5ttg9r9OIruwAn3103WUITZee e7Sbg12UgcQ5lv7SzsFNBFk3nEQQCACCuTjCjFOUdi5Nm244F+78kLghRcin/awv+IrTcIWF hUpSs1Y91iQQ7KItirz5uwCPlwejSJDQJLIS+QtJHaXDXeV6NI0Uef1hP20+y8qydDiVkv6l IreXjTb7DvksRgJNvCkWtYnlS3mYvQ9NzS9PhyALWbXnH6sIJd2O9lKS1Mrfq+y0IXCP10eS FFGg+Av3IQeFatkJAyju0PPthyTqxSI4lZYuJVPknzgaeuJv/2NccrPvmeDg6Coe7ZIeQ8Yj t0ARxu2xytAkkLCel1Lz1WLmwLstV30g80nkgZf/wr+/BXJW/oIvRlonUkxv+IbBM3dX2OV8 AmRv1ySWPTP7AAMFB/9PQK/VtlNUJvg8GXj9ootzrteGfVZVVT4XBJkfwBcpC/XcPzldjv+3 HYudvpdNK3lLujXeA5fLOH+Z/G9WBc5pFVSMocI71I8bT8lIAzreg0WvkWg5V2WZsUMlnDL9 mpwIGFhlbM3gfDMs7MPMu8YQRFVdUvtSpaAs8OFfGQ0ia3LGZcjA6Ik2+xcqscEJzNH+qh8V m5jjp28yZgaqTaRbg3M/+MTbMpicpZuqF4rnB0AQD12/3BNWDR6bmh+EkYSMcEIpQmBM51qM EKYTQGybRCjpnKHGOxG0rfFY1085mBDZCH5Kx0cl0HVJuQKC+dV2ZY5AqjcKwAxpE75MLFkr wkkEGBECAAkFAlk3nEQCGwwACgkQoDSui/t3IH7nnwCfcJWUDUFKdCsBH/E5d+0ZnMQi+G0A nAuWpQkjM1ASeQwSHEeAWPgskBQL
  • Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Anthony PERARD <anthony.perard@xxxxxxxxxx>, Michal Orzel <michal.orzel@xxxxxxx>, Julien Grall <julien@xxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>, Marcus Granado <marcus.granado@xxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx, Alejandro Vallejo <Alejandro.GarciaVallejo@xxxxxxx>
  • Delivery-date: Thu, 05 Mar 2026 10:53:47 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 26.02.2026 15:29, Bernhard Kaindl wrote:
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -944,6 +944,7 @@ struct domain *domain_create(domid_t domid,
>      spin_lock_init(&d->node_affinity_lock);
>      d->node_affinity = NODE_MASK_ALL;
>      d->auto_node_affinity = 1;
> +    d->claim_node = NUMA_NO_NODE;

If, as the cover letter says, the new domctl is going to allow claiming from
multiple nodes in one go, why would this new field still be necessary?

> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -488,7 +488,10 @@ static unsigned long *avail[MAX_NUMNODES];
>  /* Global available pages, updated in real-time, protected by heap_lock */
>  static unsigned long total_avail_pages;
>  
> -/* The global heap lock, protecting access to the heap and related 
> structures */
> +/*
> + * The global heap lock, protecting access to the heap and related structures
> + * It protects the heap and claims, d->outstanding_pages and d->claim_node
> + */
>  static DEFINE_SPINLOCK(heap_lock);

Nit: Comment style.

> @@ -510,6 +513,71 @@ static unsigned long node_avail_pages[MAX_NUMNODES];
>  /* total outstanding claims by all domains */
>  static unsigned long outstanding_claims;
>  
> +/*
> + * Per-node accessor for outstanding claims, protected by heap_lock, updated
> + * in lockstep with the global outstanding_claims and d->outstanding_pages
> + * in domain_set_outstanding_pages() and release_outstanding_claims().
> + *
> + * node_outstanding_claims(node) is used to determine the outstanding claims 
> on
> + * a node, which are subtracted from the node's available pages to determine 
> if
> + * a request can be satisfied without violating the node's memory 
> availability.
> + */
> +#define node_outstanding_claims(node) (node_outstanding_claims[node])

See the comment on the earlier patch regarding such a wrapper.

> +/* total outstanding claims by all domains on node */
> +static unsigned long node_outstanding_claims[MAX_NUMNODES];

How come this is being added, rather than it replacing outstanding_claims?

> +/* Return available pages after subtracting claimed pages */
> +static inline unsigned long available_after_claims(unsigned long avail_pages,
> +                                                   unsigned long claims)
> +{
> +    BUG_ON(claims > avail_pages);
> +    return avail_pages - claims; /* Due to the BUG_ON, it cannot be negative 
> */
> +}

A helper for a simple subtraction?

> +/* Answer if host-level memory and claims permit this request to proceed */
> +static inline bool host_allocatable_request(const struct domain *d,
> +                                            unsigned int memflags,
> +                                            unsigned long request)
> +{
> +    unsigned long allocatable_pages;
> +
> +    ASSERT(spin_is_locked(&heap_lock));
> +
> +    allocatable_pages = available_after_claims(total_avail_pages,
> +                                               outstanding_claims);
> +    if ( allocatable_pages >= request )
> +        return true; /* The not claimed pages are enough to proceed */
> +
> +    if ( !d || (memflags & MEMF_no_refcount) )
> +        return false; /* Claims are not available for this allocation */
> +
> +    /* The domain's claims are available, return true if sufficient */
> +    return request <= allocatable_pages + d->outstanding_pages;
> +}

This only uses variables which existed before, i.e. there's nothing NUMA-ish
in here. What's the deal?

> +/* Answer if node-level memory and claims permit this request to proceed */
> +static inline bool node_allocatable_request(const struct domain *d,
> +                                            unsigned int memflags,
> +                                            unsigned long request,
> +                                            nodeid_t node)
> +{
> +    unsigned long allocatable_pages;
> +
> +    ASSERT(spin_is_locked(&heap_lock));
> +    ASSERT(node < MAX_NUMNODES);
> +
> +    allocatable_pages = available_after_claims(node_avail_pages(node),
> +                                               
> node_outstanding_claims(node));
> +    if ( allocatable_pages >= request )
> +        return true; /* The not claimed pages are enough to proceed */
> +
> +    if ( !d || (memflags & MEMF_no_refcount) || (node != d->claim_node) )
> +        return false; /* Claims are not available for this allocation */
> +
> +    /* The domain's claims are available, return true if sufficient */
> +    return request <= allocatable_pages + d->outstanding_pages;
> +}

And this is the NUMA counterpart, almost identical in the basic logic. If
(for whatever reason) both are really needed, I think it should at least be
considered to fold them (with NUMA_NO_NODE indicating the non-NUMA intent).

In fact the node != d->claim_node check would probably also apply to the
non-NUMA variant (as d->claim_node != NUMA_NO_NODE).

As to the comments in both functions, personally I think
s/not claimed/unclaimed/ would be slightly more logical to follow.

In any event, the first of these functions looks like it could be split out
in a separate, earlier patch. Then (as per above) ideally here that function
would simply be extended to become NUMA-capable.

> @@ -539,14 +607,23 @@ unsigned long domain_adjust_tot_pages(struct domain *d, 
> long pages)
>      return d->tot_pages;
>  }
>  
> -/* Release outstanding claims on the domain, host and later also node */
> +/* Release outstanding claims on the domain, host and node */
>  static inline
>  void release_outstanding_claims(struct domain *d, unsigned long release)
>  {
>      ASSERT(spin_is_locked(&heap_lock));
>      BUG_ON(outstanding_claims < release);
>      outstanding_claims -= release;
> +
> +    if ( d->claim_node != NUMA_NO_NODE )
> +    {
> +        BUG_ON(node_outstanding_claims(d->claim_node) < release);
> +        node_outstanding_claims(d->claim_node) -= release;
> +    }
>      d->outstanding_pages -= release;
> +
> +    if ( d->outstanding_pages == 0 )
> +        d->claim_node = NUMA_NO_NODE; /* Clear if no outstanding pages left 
> */

I fear I don't understand this. If the domain has claims on other nodes,
why would it be switched back to non-NUMA claims?

> @@ -564,14 +642,41 @@ void consume_outstanding_claims(struct domain *d, 
> unsigned long allocation)
>  
>      /* Of course, the domain can only release up its outstanding claims */
>      allocation = min(allocation, d->outstanding_pages + 0UL);
> +
> +    if ( d->claim_node != NUMA_NO_NODE && d->claim_node != alloc_node )
> +    {
> +        /*
> +         * The domain has a claim on a node, but the alloc is on a different
> +         * node. If it would exceed the domain's max_pages, reduce the claim
> +         * up to the excess over max_pages so we don't reduce the claim more
> +         * than we have to to honor the max_pages limit.
> +         */
> +        unsigned long booked_pages = domain_tot_pages(d) + allocation +
> +                                     d->outstanding_pages;
> +        if ( booked_pages <= d->max_pages )
> +            return; /* booked is within max_pages, no excess, keep the claim 
> */
> +
> +        /* Excess detected, release the exceeding pages from the claimed 
> node */
> +        allocation = min(allocation, booked_pages - d->max_pages);
> +    }
>      release_outstanding_claims(d, allocation);

Please can there be another blank line above this one?

Why is the adjustment excluded for the NUMA_NO_NODE case? That's odd in
itself, but particularly with release_outstanding_claims() possibly switching a
domain to NUMA_NO_NODE. Plus the caller looks to be passing in the actual node
memory was taken from, not what the original request said (which is specifically
relevant when the request named no particular node).

>  }
>  
> -int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
> +/*
> + * Update outstanding claims for the domain. Note: The node is passed as an
> + * unsigned int to allow checking for overflow above the uint8_t nodeid_t 
> limit.
> + */
> +int domain_set_outstanding_pages(struct domain *d, unsigned long pages,
> +                                 unsigned int node)
>  {
>      int ret = -ENOMEM;
>      unsigned long claim, avail_pages;
>  
> +    /* When releasing a claim, the node must be NUMA_NO_NODE (it is not 
> used) */

Why would this be?

> +    if ( pages == 0 && node != NUMA_NO_NODE )
> +        return -EINVAL;
> +    if ( node != NUMA_NO_NODE && (node >= MAX_NUMNODES || 
> !node_online(node)) )
> +        return -ENOENT;
>      /*

Again, can there please be a blank line after each of the if()s?

> @@ -982,6 +1102,8 @@ static struct page_info *get_free_buddy(unsigned int 
> zone_lo,
>              }
>          } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
>  
> + try_next_node:
> +        /* If MEMF_exact_node was passed, we may not skip to a different 
> node */
>          if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
>              return NULL;

As per this, ...

> @@ -1042,13 +1164,8 @@ static struct page_info *alloc_heap_pages(
>  
>      spin_lock(&heap_lock);
>  
> -    /*
> -     * Claimed memory is considered unavailable unless the request
> -     * is made by a domain with sufficient unclaimed pages.
> -     */
> -    if ( (outstanding_claims + request > total_avail_pages) &&
> -          ((memflags & MEMF_no_refcount) ||
> -           !d || d->outstanding_pages < request) )
> +    /* Proceed if host-level memory and claims permit this request to 
> proceed */
> +    if ( !host_allocatable_request(d, memflags, request) )

... in the MEMF_exact_node case I see little reason to check the global value
here.

Jan



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.