
[xen staging] xen/riscv: implement p2m_set_range()



commit fcd6755606361c3b5a409b5cd4b2a641de2feb71
Author:     Oleksii Kurochko <oleksii.kurochko@xxxxxxxxx>
AuthorDate: Tue Dec 16 17:55:20 2025 +0100
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Thu Dec 18 14:24:32 2025 +0100

    xen/riscv: implement p2m_set_range()
    
    This patch introduces p2m_set_range() and its core helper p2m_set_entry()
    for RISC-V, based loosely on the Arm implementation, with several
    RISC-V-specific modifications.
    
    The main changes are:
    - Simplification of the Break-Before-Make (BBM) approach, as permitted by
      the RISC-V spec:
        It is permitted for multiple address-translation cache entries to
        co-exist for the same address. This represents the fact that in a
        conventional TLB hierarchy, it is possible for multiple entries to
        match a single address if, for example, a page is upgraded to a
        superpage without first clearing the original non-leaf PTE's valid
        bit and executing an SFENCE.VMA with rs1=x0, or if multiple TLBs
        exist in parallel at a given level of the hierarchy. In this case,
        just as if an SFENCE.VMA is not executed between a write to the
        memory-management tables and subsequent implicit read of the same
        address: it is unpredictable whether the old non-leaf PTE or the new
        leaf PTE is used, but the behavior is otherwise well defined.
      In contrast to the Arm architecture, where BBM is mandatory and failing
      to use it in some cases can lead to CPU instability, RISC-V guarantees
      stability, and the behavior remains safe, though unpredictable in terms
      of which translation will be used.
    - Unlike Arm, the valid bit is not repurposed for other uses in this
      implementation. Instead, entry validity is determined solely by the
      P2M PTE's valid bit.
    
    The main functionality is in p2m_set_entry(), which handles mappings aligned
    to page table block entries (e.g., 1GB, 2MB, or 4KB with 4KB granularity).
    
    p2m_set_range() breaks a region down into block-aligned mappings and calls
    p2m_set_entry() accordingly.
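
    For illustration, the following is a minimal, self-contained user-space
    sketch of the same splitting policy (the names and constants are local to
    the example, not Xen APIs; it assumes 4KB granularity, so orders 18, 9 and
    0 correspond to 1GB, 2MB and 4KB blocks, and it ignores the INVALID_MFN
    special case used for removals):

        #include <stdio.h>

        /* Pick the largest block that fits in `nr` frames and whose alignment
         * both frame numbers satisfy, mirroring p2m_mapping_order(). */
        static unsigned int pick_order(unsigned long gfn, unsigned long mfn,
                                       unsigned long nr)
        {
            static const unsigned int orders[] = { 18, 9, 0 };

            for ( unsigned int i = 0; i < 3; i++ )
            {
                unsigned long block = 1UL << orders[i];

                if ( !((gfn | mfn) & (block - 1)) && nr >= block )
                    return orders[i];
            }

            return 0;
        }

        int main(void)
        {
            /* A 2MB + 4KB region starting at a 2MB-aligned frame. */
            unsigned long gfn = 0x80200, mfn = 0x80200, left = 0x201;

            while ( left )
            {
                unsigned int order = pick_order(gfn, mfn, left);

                printf("map gfn %#lx -> mfn %#lx, order %u\n", gfn, mfn, order);
                gfn += 1UL << order;
                mfn += 1UL << order;
                left -= 1UL << order;
            }

            return 0;
        }

    This would emit one order-9 (2MB) mapping followed by one order-0 (4KB)
    mapping, which is the sequence of p2m_set_entry() calls that
    p2m_set_range() would make for such a region.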
    
    Stub implementations (to be completed later) include:
    - p2m_free_subtree()
    - p2m_next_level()
    - p2m_pte_from_mfn()
    
    Note: Support for shattering block entries is not implemented in this
    patch and will be added separately.
    
    Additionally, some straightforward helper functions are now implemented:
    - p2m_write_pte()
    - p2m_clean_pte()
    - p2m_get_root_pointer()
    
    Signed-off-by: Oleksii Kurochko <oleksii.kurochko@xxxxxxxxx>
    Acked-by: Jan Beulich <jbeulich@xxxxxxxx>
---
 xen/arch/riscv/include/asm/p2m.h |  38 +++++
 xen/arch/riscv/p2m.c             | 326 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 363 insertions(+), 1 deletion(-)

diff --git a/xen/arch/riscv/include/asm/p2m.h b/xen/arch/riscv/include/asm/p2m.h
index 9acd6a64a8..fa55d8a3bc 100644
--- a/xen/arch/riscv/include/asm/p2m.h
+++ b/xen/arch/riscv/include/asm/p2m.h
@@ -8,10 +8,38 @@
 #include <xen/rwlock.h>
 #include <xen/types.h>
 
+#include <asm/page.h>
 #include <asm/page-bits.h>
 
 #define P2M_ROOT_ORDER  (ilog2(GSTAGE_ROOT_PAGE_TABLE_SIZE) - PAGE_SHIFT)
 #define P2M_ROOT_PAGES  BIT(P2M_ROOT_ORDER, U)
+#define P2M_ROOT_LEVEL(p2m) ((p2m)->mode.paging_levels)
+
+/*
+ * According to the RISC-V spec:
+ *   When hgatp.MODE specifies a translation scheme of Sv32x4, Sv39x4, Sv48x4,
+ *   or Sv57x4, G-stage address translation is a variation on the usual
+ *   page-based virtual address translation scheme of Sv32, Sv39, Sv48, or
+ *   Sv57, respectively. In each case, the size of the incoming address is
+ *   widened by 2 bits (to 34, 41, 50, or 59 bits).
+ *
+ * P2M_LEVEL_ORDER(lvl) defines the bit position in the GFN from which
+ * the index for this level of the P2M page table starts. The extra 2
+ * bits added by the "x4" schemes only affect the root page table width.
+ *
+ * Therefore, this macro can safely reuse XEN_PT_LEVEL_ORDER() for all
+ * levels: the extra 2 bits do not change the indices of lower levels.
+ */
+#define P2M_LEVEL_ORDER(lvl) XEN_PT_LEVEL_ORDER(lvl)
+
+#define P2M_ROOT_EXTRA_BITS(p2m, lvl) (2 * ((lvl) == P2M_ROOT_LEVEL(p2m)))
+
+#define P2M_PAGETABLE_ENTRIES(p2m, lvl) \
+    (BIT(PAGETABLE_ORDER + P2M_ROOT_EXTRA_BITS(p2m, lvl), UL))
+
+#define P2M_TABLE_OFFSET(p2m, lvl) (P2M_PAGETABLE_ENTRIES(p2m, lvl) - 1UL)
+
+#define P2M_GFN_LEVEL_SHIFT(lvl) (P2M_LEVEL_ORDER(lvl) + PAGE_SHIFT)
 
 #define paddr_bits PADDR_BITS
 
@@ -58,6 +86,16 @@ struct p2m_domain {
      * when a page is needed to be fully cleared and cleaned.
      */
     bool clean_dcache;
+
+    /* Highest guest frame that's ever been mapped in the p2m */
+    gfn_t max_mapped_gfn;
+
+    /*
+     * Lowest mapped gfn in the p2m. When releasing mapped GFNs in a
+     * preemptible manner this is updated to track where to resume
+     * the search. Apart from during teardown this can only decrease.
+     */
+    gfn_t lowest_mapped_gfn;
 };
 
 /*
diff --git a/xen/arch/riscv/p2m.c b/xen/arch/riscv/p2m.c
index 8bb197f9b3..c23926933f 100644
--- a/xen/arch/riscv/p2m.c
+++ b/xen/arch/riscv/p2m.c
@@ -9,6 +9,7 @@
 #include <xen/rwlock.h>
 #include <xen/sched.h>
 #include <xen/sections.h>
+#include <xen/xvmalloc.h>
 
 #include <asm/csr.h>
 #include <asm/flushtlb.h>
@@ -17,6 +18,13 @@
 #include <asm/riscv_encoding.h>
 #include <asm/vmid.h>
 
+/*
+ * At the moment, only 4K, 2M, and 1G mappings are supported for G-stage
+ * translation. Therefore, the maximum supported page-table level is 2,
+ * which corresponds to 1G mappings.
+ */
+#define P2M_MAX_SUPPORTED_LEVEL_MAPPING _AC(2, U)
+
 static struct gstage_mode_desc __ro_after_init max_gstage_mode = {
     .mode = HGATP_MODE_OFF,
     .paging_levels = 0,
@@ -28,6 +36,77 @@ unsigned char get_max_supported_mode(void)
     return max_gstage_mode.mode;
 }
 
+static inline unsigned int calc_offset(const struct p2m_domain *p2m,
+                                       const unsigned int lvl,
+                                       const paddr_t gpa)
+{
+    unsigned int off = (gpa >> P2M_GFN_LEVEL_SHIFT(lvl)) &
+                       P2M_TABLE_OFFSET(p2m, lvl);
+
+    /*
+     * For P2M_ROOT_LEVEL, `offset` ranges from 0 to 2047, since the root
+     * page table spans 4 consecutive 4KB pages.
+     * We want to return an index within one of these 4 pages.
+     * The specific page to use is determined by `p2m_get_root_pointer()`.
+     *
+     * Example: if `offset == 512`:
+     *  - A single 4KB page holds 512 entries.
+     *  - Therefore, entry 512 corresponds to index 0 of the second page.
+     *
+     * At all other levels, only one page is allocated, and `offset` is
+     * always in the range 0 to 511, since the VPN is 9 bits long.
+     */
+    return off & (PAGETABLE_ENTRIES - 1);
+}
+
+#define P2M_MAX_ROOT_LEVEL 5
+
+#define P2M_BUILD_LEVEL_OFFSETS(p2m, var, addr) \
+    unsigned int var[P2M_MAX_ROOT_LEVEL]; \
+    BUG_ON(P2M_ROOT_LEVEL(p2m) >= P2M_MAX_ROOT_LEVEL); \
+    for ( unsigned int i = 0; i <= P2M_ROOT_LEVEL(p2m); i++ ) \
+        var[i] = calc_offset(p2m, i, addr);
+
+/*
+ * Map one of the four root pages of the P2M root page table.
+ *
+ * The P2M root page table is larger than normal (16KB instead of 4KB),
+ * so it is allocated as four consecutive 4KB pages. This function selects
+ * the appropriate 4KB page based on the given GFN and returns a mapping
+ * to it.
+ *
+ * The caller is responsible for unmapping the page after use.
+ *
+ * Returns NULL if the calculated offset into the root table is invalid.
+ */
+static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
+{
+    unsigned long idx;
+    unsigned long root_level = P2M_ROOT_LEVEL(p2m);
+
+    idx = gfn_x(gfn) >> P2M_LEVEL_ORDER(root_level);
+    if ( idx >= P2M_PAGETABLE_ENTRIES(p2m, root_level) )
+        return NULL;
+
+    /*
+     * The P2M root page table is extended by 2 bits, making its size 16KB
+     * (instead of 4KB for non-root page tables). Therefore, p2m->root is
+     * allocated as four consecutive 4KB pages (since alloc_domheap_pages()
+     * only allocates 4KB pages).
+     *
+     * Initially, `idx` is derived directly from `gfn`.
+     * To locate the correct entry within a single 4KB page,
+     * we rescale the offset so it falls within one of the 4 pages.
+     *
+     * Example: if `idx == 512`
+     * - A 4KB page holds 512 entries.
+     * - Thus, entry 512 corresponds to index 0 of the second page.
+     */
+    idx /= PAGETABLE_ENTRIES;
+
+    return __map_domain_page(p2m->root + idx);
+}
+
 static void __init gstage_mode_detect(void)
 {
     static const struct gstage_mode_desc modes[] __initconst = {
@@ -228,6 +307,9 @@ int p2m_init(struct domain *d)
     rwlock_init(&p2m->lock);
     INIT_PAGE_LIST_HEAD(&p2m->pages);
 
+    p2m->max_mapped_gfn = _gfn(0);
+    p2m->lowest_mapped_gfn = _gfn(ULONG_MAX);
+
     /*
      * Currently, the infrastructure required to enable CONFIG_HAS_PASSTHROUGH
      * is not ready for RISC-V support.
@@ -274,13 +356,255 @@ int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
     return rc;
 }
 
+static inline void p2m_write_pte(pte_t *p, pte_t pte, bool clean_cache)
+{
+    write_pte(p, pte);
+
+    /*
+     * TODO: if multiple adjacent PTEs are written without releasing
+     *       the lock, the resulting redundant cache flushing can
+     *       become a performance issue.
+     */
+    if ( clean_cache )
+        clean_dcache_va_range(p, sizeof(*p));
+}
+
+static inline void p2m_clean_pte(pte_t *p, bool clean_cache)
+{
+    pte_t pte = { .pte = 0 };
+
+    p2m_write_pte(p, pte, clean_cache);
+}
+
+static pte_t p2m_pte_from_mfn(mfn_t mfn, p2m_type_t t)
+{
+    panic("%s: hasn't been implemented yet\n", __func__);
+
+    return (pte_t) { .pte = 0 };
+}
+
+#define P2M_TABLE_MAP_NONE 0
+#define P2M_TABLE_MAP_NOMEM 1
+#define P2M_TABLE_SUPER_PAGE 2
+#define P2M_TABLE_NORMAL 3
+
+/*
+ * Take the currently mapped table, find the entry corresponding to the GFN,
+ * and map the next-level table if available. The previous table will be
+ * unmapped if the next level was mapped (e.g., when P2M_TABLE_NORMAL is
+ * returned).
+ *
+ * `alloc_tbl` parameter indicates whether intermediate tables should
+ * be allocated when not present.
+ *
+ * Return values:
+ *  P2M_TABLE_MAP_NONE: a table allocation isn't permitted.
+ *  P2M_TABLE_MAP_NOMEM: allocating a new page failed.
+ *  P2M_TABLE_SUPER_PAGE: the next entry points to a superpage.
+ *  P2M_TABLE_NORMAL: the next level or leaf mapped normally.
+ */
+static int p2m_next_level(struct p2m_domain *p2m, bool alloc_tbl,
+                          unsigned int level, pte_t **table,
+                          unsigned int offset)
+{
+    panic("%s: hasn't been implemented yet\n", __func__);
+
+    return P2M_TABLE_MAP_NONE;
+}
+
+/* Free pte sub-tree behind an entry */
+static void p2m_free_subtree(struct p2m_domain *p2m,
+                             pte_t entry, unsigned int level)
+{
+    panic("%s: hasn't been implemented yet\n", __func__);
+}
+
+/* Insert an entry in the p2m */
+static int p2m_set_entry(struct p2m_domain *p2m,
+                         gfn_t gfn,
+                         unsigned long page_order,
+                         mfn_t mfn,
+                         p2m_type_t t)
+{
+    unsigned int level;
+    unsigned int target = page_order / PAGETABLE_ORDER;
+    pte_t *entry, *table, orig_pte;
+    int rc;
+    /*
+     * A mapping is removed only if the MFN is explicitly set to INVALID_MFN.
+     * Other MFNs that are considered invalid by mfn_valid() (e.g., MMIO)
+     * are still allowed.
+     */
+    bool removing_mapping = mfn_eq(mfn, INVALID_MFN);
+    P2M_BUILD_LEVEL_OFFSETS(p2m, offsets, gfn_to_gaddr(gfn));
+
+    ASSERT(p2m_is_write_locked(p2m));
+
+    /*
+     * Check if the level target is valid: we only support
+     * 4K - 2M - 1G mapping.
+     */
+    ASSERT(target <= P2M_MAX_SUPPORTED_LEVEL_MAPPING);
+
+    table = p2m_get_root_pointer(p2m, gfn);
+    if ( !table )
+        return -EINVAL;
+
+    for ( level = P2M_ROOT_LEVEL(p2m); level > target; level-- )
+    {
+        /*
+         * Don't try to allocate intermediate page table if the mapping
+         * is about to be removed.
+         */
+        rc = p2m_next_level(p2m, !removing_mapping,
+                            level, &table, offsets[level]);
+        if ( (rc == P2M_TABLE_MAP_NONE) || (rc == P2M_TABLE_MAP_NOMEM) )
+        {
+            rc = (rc == P2M_TABLE_MAP_NONE) ? -ENOENT : -ENOMEM;
+            /*
+             * We are here because p2m_next_level has failed to map
+             * the intermediate page table (e.g. the table does not exist
+             * and none should be allocated). It is a valid case
+             * when removing a mapping as it may not exist in the
+             * page table. In this case, just ignore lookup failure.
+             */
+            rc = removing_mapping ? 0 : rc;
+            goto out;
+        }
+
+        if ( rc != P2M_TABLE_NORMAL )
+            break;
+    }
+
+    entry = table + offsets[level];
+
+    /*
+     * If we are here with level > target, we must be at a leaf node,
+     * and we need to break up the superpage.
+     */
+    if ( level > target )
+    {
+        panic("Shattering isn't implemented\n");
+    }
+
+    /*
+     * We should always arrive here at the correct level because all the
+     * intermediate tables have been installed if necessary.
+     */
+    ASSERT(level == target);
+
+    orig_pte = *entry;
+
+    if ( removing_mapping )
+        p2m_clean_pte(entry, p2m->clean_dcache);
+    else
+    {
+        pte_t pte = p2m_pte_from_mfn(mfn, t);
+
+        p2m_write_pte(entry, pte, p2m->clean_dcache);
+
+        p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn,
+                                      gfn_add(gfn, BIT(page_order, UL) - 1));
+        p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, gfn);
+    }
+
+    p2m->need_flush = true;
+
+    /*
+     * Currently, the infrastructure required to enable CONFIG_HAS_PASSTHROUGH
+     * is not ready for RISC-V support.
+     *
+     * When CONFIG_HAS_PASSTHROUGH=y, iommu_iotlb_flush() should be done
+     * here.
+     */
+#ifdef CONFIG_HAS_PASSTHROUGH
+#   error "add code to flush IOMMU TLB"
+#endif
+
+    rc = 0;
+
+    /*
+     * In case of a VALID -> INVALID transition, the original PTE should
+     * always be freed.
+     *
+     * In case of a VALID -> VALID transition, the original PTE should be
+     * freed only if the MFNs are different. If the MFNs are the same
+     * (i.e., only permissions differ), there is no need to free the
+     * original PTE.
+     */
+    if ( pte_is_valid(orig_pte) &&
+         (!pte_is_valid(*entry) ||
+          !mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte))) )
+        p2m_free_subtree(p2m, orig_pte, level);
+
+ out:
+    unmap_domain_page(table);
+
+    return rc;
+}
+
+/* Return mapping order for given gfn, mfn and nr */
+static unsigned long p2m_mapping_order(const struct p2m_domain *p2m, gfn_t gfn,
+                                       mfn_t mfn, unsigned long nr)
+{
+    unsigned long mask;
+    /* 1GB, 2MB, and 4KB mappings are supported. */
+    unsigned int level = min(P2M_ROOT_LEVEL(p2m), P2M_MAX_SUPPORTED_LEVEL_MAPPING);
+    unsigned long order = 0;
+
+    mask = !mfn_eq(mfn, INVALID_MFN) ? mfn_x(mfn) : 0;
+    mask |= gfn_x(gfn);
+
+    for ( ; level != 0; level-- )
+    {
+        if ( !(mask & (BIT(P2M_LEVEL_ORDER(level), UL) - 1)) &&
+             (nr >= BIT(P2M_LEVEL_ORDER(level), UL)) )
+        {
+            order = P2M_LEVEL_ORDER(level);
+            break;
+        }
+    }
+
+    return order;
+}
+
 static int p2m_set_range(struct p2m_domain *p2m,
                          gfn_t sgfn,
                          unsigned long nr,
                          mfn_t smfn,
                          p2m_type_t t)
 {
-    return -EOPNOTSUPP;
+    int rc = 0;
+    unsigned long left = nr;
+
+    /*
+     * Any reference taken by the P2M mappings (e.g. foreign mapping) will
+     * be dropped in relinquish_p2m_mapping(). As the P2M will still
+     * be accessible afterwards, we need to prevent mappings from being
+     * added while the domain is dying.
+     */
+    if ( unlikely(p2m->domain->is_dying) )
+        return -EACCES;
+
+    while ( left )
+    {
+        unsigned long order = p2m_mapping_order(p2m, sgfn, smfn, left);
+
+        rc = p2m_set_entry(p2m, sgfn, order, smfn, t);
+        if ( rc )
+            break;
+
+        sgfn = gfn_add(sgfn, BIT(order, UL));
+        if ( !mfn_eq(smfn, INVALID_MFN) )
+            smfn = mfn_add(smfn, BIT(order, UL));
+
+        left -= BIT(order, UL);
+    }
+
+    if ( left > INT_MAX )
+        rc = -EOVERFLOW;
+
+    return !left ? rc : left;
 }
 
 int map_regions_p2mt(struct domain *d,
--
generated by git-patchbot for /home/xen/git/xen.git#staging