[Xen-changelog] [xen staging-4.8] AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables
commit 8db85532cbb80c6396e5dab8809feb7b7b0d5c45
Author: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
AuthorDate: Wed Dec 11 16:00:02 2019 +0100
Commit: Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Wed Dec 11 16:00:02 2019 +0100
AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables
update_paging_mode() has multiple bugs:

 1) Booting with iommu=debug will cause it to inform you that it is called
    without the pdev_list lock held.

 2) When growing by more than a single level, it leaks the newly allocated
    table(s) in the case of a further error.

Furthermore, the choice of default level for a domain has issues:

 1) All HVM guests grow from 2 to 3 levels during construction because of the
    position of the VRAM just below the 4G boundary, so defaulting to 2 is a
    waste of effort.

 2) The limit for PV guests doesn't take memory hotplug into account, and
    isn't dynamic at runtime like it is for HVM guests.  This means that a PV
    guest may get RAM which it can't map in the IOMMU.

The dynamic height is a property unique to AMD, and adds a substantial
quantity of complexity for what is a marginal performance improvement.
Remove the complexity by removing the dynamic height.

PV guests now get 3 or 4 levels based on any hotplug regions in the host.
This only makes a difference for hardware which previously had all RAM below
the 512G boundary, and a hotplug region above.

HVM guests now get 4 levels (which will be sufficient until 256TB guests
become a thing), because we don't currently have the information to know when
3 would be safe to use.  (A level-coverage sketch follows the diffstat
below.)

The overhead of this extra level is not expected to be noticeable.  It costs
one page (4k) per domain, and one extra IO-TLB paging structure cache entry
which is very hot and less likely to be evicted.

This is XSA-311.
Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Acked-by: Jan Beulich <jbeulich@xxxxxxxx>
master commit: b4f042236ae0bb6725b3e8dd40af5a2466a6f971
master date: 2019-12-11 14:55:32 +0100
---
 xen/arch/x86/mm.c                           |  11 +++
 xen/drivers/passthrough/amd/iommu_map.c     | 104 ----------------------------
 xen/drivers/passthrough/amd/pci_amd_iommu.c |  16 +++--
 xen/include/xen/mm.h                        |   3 +
 4 files changed, 25 insertions(+), 109 deletions(-)
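
For readers checking the level arithmetic in the commit message: each level of
an AMD IOMMU page table resolves 9 address bits (512 entries per table) on top
of the 4k page offset, so 3 levels reach the 512G boundary and 4 levels reach
256TB.  A standalone sketch of that calculation (illustrative only, not part
of the patch; the two #defines mirror the Xen constant names):

    #include <stdio.h>

    #define PAGE_SHIFT          12  /* mirrors the Xen constant */
    #define PTE_PER_TABLE_SHIFT  9  /* mirrors the Xen constant */

    /* Bytes of address space a table of the given height can map. */
    static unsigned long long level_coverage(unsigned int level)
    {
        return 1ULL << (PAGE_SHIFT + PTE_PER_TABLE_SHIFT * level);
    }

    int main(void)
    {
        unsigned int level;

        /* Prints 1 GiB, 512 GiB, 262144 GiB (= 256 TiB). */
        for ( level = 2; level <= 4; level++ )
            printf("level %u covers %llu GiB\n",
                   level, level_coverage(level) >> 30);

        return 0;
    }

Level 2 tops out at 1G, which is why every HVM guest outgrew it as soon as
VRAM was mapped just below the 4G boundary.
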
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 3600e89f76..05d62a5d47 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -7403,6 +7403,17 @@ void paging_invlpg(struct vcpu *v, unsigned long va)
         hvm_funcs.invlpg(v, va);
 }
 
+unsigned long get_upper_mfn_bound(void)
+{
+    unsigned long max_mfn;
+
+    max_mfn = mem_hotplug ? PFN_DOWN(mem_hotplug) : max_page;
+#ifndef CONFIG_BIGMEM
+    max_mfn = min(max_mfn, 1UL << 32);
+#endif
+    return min(max_mfn, 1UL << (paddr_bits - PAGE_SHIFT)) - 1;
+}
+
 /*
  * Local variables:
  * mode: C
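
The new helper clamps the bound in three steps: the end of any hotplug region
(falling back to max_page), the non-CONFIG_BIGMEM cap of 2^32 frames, and the
CPU's physical address width.  A standalone model with hypothetical sample
values (the three consts stand in for the Xen globals of the same names):

    #include <stdio.h>

    #define PAGE_SHIFT  12
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
    #define MIN(a, b)   ((a) < (b) ? (a) : (b))

    /* Hypothetical sample host: 8G of boot RAM, a hotplug region ending
     * at 1T, and 46 physical address bits. */
    static const unsigned long long mem_hotplug = 1ULL << 40;
    static const unsigned long long max_page    = PFN_DOWN(8ULL << 30);
    static const unsigned int       paddr_bits  = 46;

    static unsigned long long upper_mfn_bound_model(void)
    {
        unsigned long long max_mfn =
            mem_hotplug ? PFN_DOWN(mem_hotplug) : max_page;

        /* Without CONFIG_BIGMEM, Xen caps the frame range at 2^32 pages. */
        max_mfn = MIN(max_mfn, 1ULL << 32);

        /* Never report beyond what the CPU can physically address. */
        return MIN(max_mfn, 1ULL << (paddr_bits - PAGE_SHIFT)) - 1;
    }

    int main(void)
    {
        /* Prints 0xfffffff: the hotplug end (1T = 2^28 frames) dominates. */
        printf("upper MFN bound: %#llx\n", upper_mfn_bound_model());
        return 0;
    }
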
diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
index 6cbce1dfcd..688abf017d 100644
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -570,97 +570,6 @@ static int iommu_pde_from_gfn(struct domain *d, unsigned long pfn,
     return 0;
 }
 
-static int update_paging_mode(struct domain *d, unsigned long gfn)
-{
-    u16 bdf;
-    void *device_entry;
-    unsigned int req_id, level, offset;
-    unsigned long flags;
-    struct pci_dev *pdev;
-    struct amd_iommu *iommu = NULL;
-    struct page_info *new_root = NULL;
-    struct page_info *old_root = NULL;
-    void *new_root_vaddr;
-    unsigned long old_root_mfn;
-    struct domain_iommu *hd = dom_iommu(d);
-
-    if ( gfn == gfn_x(INVALID_GFN) )
-        return -EADDRNOTAVAIL;
-    ASSERT(!(gfn >> DEFAULT_DOMAIN_ADDRESS_WIDTH));
-
-    level = hd->arch.paging_mode;
-    old_root = hd->arch.root_table;
-    offset = gfn >> (PTE_PER_TABLE_SHIFT * (level - 1));
-
-    ASSERT(spin_is_locked(&hd->arch.mapping_lock) && is_hvm_domain(d));
-
-    while ( offset >= PTE_PER_TABLE_SIZE )
-    {
-        /* Allocate and install a new root table.
-         * Only upper I/O page table grows, no need to fix next level bits */
-        new_root = alloc_amd_iommu_pgtable();
-        if ( new_root == NULL )
-        {
-            AMD_IOMMU_DEBUG("%s Cannot allocate I/O page table\n",
-                            __func__);
-            return -ENOMEM;
-        }
-
-        new_root_vaddr = __map_domain_page(new_root);
-        old_root_mfn = page_to_mfn(old_root);
-        set_iommu_pde_present(new_root_vaddr, old_root_mfn, level,
-                              !!IOMMUF_writable, !!IOMMUF_readable);
-        level++;
-        old_root = new_root;
-        offset >>= PTE_PER_TABLE_SHIFT;
-        unmap_domain_page(new_root_vaddr);
-    }
-
-    if ( new_root != NULL )
-    {
-        hd->arch.paging_mode = level;
-        hd->arch.root_table = new_root;
-
-        if ( !pcidevs_locked() )
-            AMD_IOMMU_DEBUG("%s Try to access pdev_list "
-                            "without aquiring pcidevs_lock.\n", __func__);
-
-        /* Update device table entries using new root table and paging mode */
-        for_each_pdev( d, pdev )
-        {
-            bdf = PCI_BDF2(pdev->bus, pdev->devfn);
-            iommu = find_iommu_for_device(pdev->seg, bdf);
-            if ( !iommu )
-            {
-                AMD_IOMMU_DEBUG("%s Fail to find iommu.\n", __func__);
-                return -ENODEV;
-            }
-
-            spin_lock_irqsave(&iommu->lock, flags);
-            do {
-                req_id = get_dma_requestor_id(pdev->seg, bdf);
-                device_entry = iommu->dev_table.buffer +
-                               (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
-
-                /* valid = 0 only works for dom0 passthrough mode */
-                amd_iommu_set_root_page_table((u32 *)device_entry,
-                                              page_to_maddr(hd->arch.root_table),
-                                              d->domain_id,
-                                              hd->arch.paging_mode, 1);
-
-                amd_iommu_flush_device(iommu, req_id);
-                bdf += pdev->phantom_stride;
-            } while ( PCI_DEVFN2(bdf) != pdev->devfn &&
-                      PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
-            spin_unlock_irqrestore(&iommu->lock, flags);
-        }
-
-        /* For safety, invalidate all entries */
-        amd_iommu_flush_all_pages(d);
-    }
-    return 0;
-}
-
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                        unsigned int flags)
 {
@@ -678,19 +587,6 @@ int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
 
     spin_lock(&hd->arch.mapping_lock);
 
-    /* Since HVM domain is initialized with 2 level IO page table,
-     * we might need a deeper page table for lager gfn now */
-    if ( is_hvm_domain(d) )
-    {
-        if ( update_paging_mode(d, gfn) )
-        {
-            spin_unlock(&hd->arch.mapping_lock);
-            AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn);
-            domain_crash(d);
-            return -EFAULT;
-        }
-    }
-
     if ( iommu_pde_from_gfn(d, gfn, pt_mfn, true) || (pt_mfn[1] == 0) )
     {
         spin_unlock(&hd->arch.mapping_lock);
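
Bug 2 of the commit message is visible in the loop just removed: each
iteration chains the previous root under a freshly allocated table, but
nothing reachable points at the chain until hd->arch.root_table is assigned
after the loop.  Condensed from the removed code above, with the failure path
annotated (not new Xen code):

    while ( offset >= PTE_PER_TABLE_SIZE )
    {
        new_root = alloc_amd_iommu_pgtable();
        if ( new_root == NULL )
            return -ENOMEM;   /* tables from earlier iterations are now
                               * unreachable and never freed */
        /* ... chain old_root underneath new_root ... */
        old_root = new_root;
        offset >>= PTE_PER_TABLE_SHIFT;
    }
    /* Only here does the new chain become reachable: */
    hd->arch.root_table = new_root;
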
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index f9aee7907f..2b62e1b256 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -271,11 +271,17 @@ static int amd_iommu_domain_init(struct domain *d)
         return -ENOMEM;
     }
 
-    /* For pv and dom0, stick with get_paging_mode(max_page)
-     * For HVM dom0, use 2 level page table at first */
-    hd->arch.paging_mode = is_hvm_domain(d) ?
-                           IOMMU_PAGING_MODE_LEVEL_2 :
-                           get_paging_mode(max_page);
+    /*
+     * Choose the number of levels for the IOMMU page tables.
+     * - PV needs 3 or 4, depending on whether there is RAM (including hotplug
+     *   RAM) above the 512G boundary.
+     * - HVM could in principle use 3 or 4 depending on how much guest
+     *   physical address space we give it, but this isn't known yet so use 4
+     *   unilaterally.
+     */
+    hd->arch.paging_mode = is_hvm_domain(d)
+        ? IOMMU_PAGING_MODE_LEVEL_4 : get_paging_mode(get_upper_mfn_bound());
+
     return 0;
 }
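
get_paging_mode(), which now receives get_upper_mfn_bound()'s result, is not
itself touched by the patch.  A standalone sketch of the calculation such a
helper performs (assumptions: the argument is a count of page frames, each
table level resolves 9 bits, and at most 6 levels are supported):

    #include <stdio.h>

    /* Smallest number of 512-entry (9-bit) table levels able to map
     * `entries` page frames; returns -1 on bad input or overflow. */
    static int paging_mode_sketch(unsigned long long entries)
    {
        int level = 1;

        if ( !entries )
            return -1;

        while ( entries > 512 )
        {
            entries = (entries + 511) >> 9;   /* ceiling divide by 512 PTEs */
            if ( ++level > 6 )                /* assumed AMD IOMMU maximum */
                return -1;
        }

        return level;
    }

    int main(void)
    {
        /* 2^27 frames = exactly 512G of address space: 3 levels suffice;
         * one frame more needs a 4th level. */
        printf("512G:      %d levels\n", paging_mode_sketch(1ULL << 27));
        printf("512G + 4k: %d levels\n", paging_mode_sketch((1ULL << 27) + 1));
        return 0;
    }
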
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 05f7dea14e..48ea6d652c 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -577,6 +577,9 @@ int prepare_ring_for_helper(struct domain *d, unsigned long gmfn,
                             struct page_info **_page, void **_va);
 void destroy_ring_for_helper(void **_va, struct page_info *page);
 
+/* Return the upper bound of MFNs, including hotplug memory. */
+unsigned long get_upper_mfn_bound(void);
+
 #include <asm/flushtlb.h>
 
 static inline void accumulate_tlbflush(bool *need_tlbflush,
--
generated by git-patchbot for /home/xen/git/xen.git#staging-4.8
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog