[xen staging] x86/pv: Optimise prefetching in svm_load_segs()
commit fca8d65d94939e05f7eff147358e58ffeb424e6e
Author: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
AuthorDate: Tue Sep 8 18:53:25 2020 +0100
Commit: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CommitDate: Tue Sep 29 13:46:13 2020 +0100
x86/pv: Optimise prefetching in svm_load_segs()
Split into two functions. Passing a load of zeros in results in unnecessary
caller setup code.
Update the prefetching comment to note that the main point is the TLB fill.
Reorder the writes in svm_load_segs() to access the VMCB fields in ascending
order, which gets better next-line prefetch behaviour out of hardware.
Update the prefetch instruction to match.
The net delta is:
add/remove: 1/0 grow/shrink: 0/2 up/down: 38/-39 (-1)
Function                       old     new   delta
svm_load_segs_prefetch           -      38     +38
__context_switch               967     951     -16
svm_load_segs                  291     268     -23
Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx>
---
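As an illustrative aside, not part of the patch itself: below is a minimal,
self-contained C sketch of the pattern the commit message describes -- a
prefetch-only helper that touches the lowest address the load path uses, and
a load routine that writes its fields in ascending address order so the
hardware's next-line prefetcher can follow along. The struct layout, field
names and the prefetch_for_write() wrapper are hypothetical stand-ins; Xen's
actual code operates on its VMCB structure and uses the prefetchw() macro.

/*
 * Sketch only: prefetch-only helper plus a load path with ascending-order
 * writes.  Types and names here are illustrative, not Xen's definitions.
 */
#include <stdint.h>
#include <string.h>

struct seg {
    uint16_t sel;
    uint16_t attr;
    uint32_t limit;
    uint64_t base;
};

struct ctx {
    struct seg fs;          /* lowest address touched by the load path */
    struct seg gs;
    struct seg ldtr;
    uint64_t kerngsbase;
};

static inline void prefetch_for_write(const void *p)
{
    __builtin_prefetch(p, 1, 3);   /* prefetch for write, high locality */
}

/* Prefetch-only helper: fills the TLB/cache line, writes no state. */
void ctx_prefetch(const struct ctx *c)
{
    if ( c )
        prefetch_for_write(&c->fs);
}

/* Full load: fields written in ascending address order (fs, gs, ldtr, ...). */
void ctx_load(struct ctx *c, uint64_t fs_base, uint64_t gs_base,
              uint64_t gs_shadow)
{
    c->fs.sel = 0;
    c->fs.attr = 0;
    c->fs.limit = 0;
    c->fs.base = fs_base;

    c->gs.sel = 0;
    c->gs.attr = 0;
    c->gs.limit = 0;
    c->gs.base = gs_base;

    memset(&c->ldtr, 0, sizeof(c->ldtr));

    c->kerngsbase = gs_shadow;
}

A caller that only wants the prefetch (as __context_switch now does) calls
the argument-free helper, avoiding the setup of five zero arguments that the
old svm_load_segs(0, 0, 0, 0, 0) call required.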
xen/arch/x86/domain.c | 2 +-
xen/arch/x86/hvm/svm/svm.c | 43 ++++++++++++++++++++-------------------
xen/include/asm-x86/hvm/svm/svm.h | 5 +++--
3 files changed, 26 insertions(+), 24 deletions(-)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 1cc85f4ab9..5ce11cebf8 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1925,7 +1925,7 @@ static void __context_switch(void)
/* Prefetch the VMCB if we expect to use it later in the context switch */
if ( cpu_has_svm && is_pv_domain(nd) && !is_pv_32bit_domain(nd) &&
!is_idle_domain(nd) )
- svm_load_segs(0, 0, 0, 0, 0);
+ svm_load_segs_prefetch();
#endif
if ( need_full_gdt(nd) && !per_cpu(full_gdt_loaded, cpu) )
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index fa9fca94e2..9b65445577 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1520,6 +1520,19 @@ static void svm_init_erratum_383(const struct cpuinfo_x86 *c)
}
#ifdef CONFIG_PV
+void svm_load_segs_prefetch(void)
+{
+ const struct vmcb_struct *vmcb = this_cpu(host_vmcb_va);
+
+ if ( vmcb )
+ /*
+ * The main reason for this prefetch is for the TLB fill. Use the
+ * opportunity to fetch the lowest address used, to get the best
+ * behaviour out of hardware's next-line prefetcher.
+ */
+ prefetchw(&vmcb->fs);
+}
+
bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
unsigned long fs_base, unsigned long gs_base,
unsigned long gs_shadow)
@@ -1530,17 +1543,15 @@ bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
if ( unlikely(!vmcb) )
return false;
- if ( !ldt_base )
- {
- /*
- * The actual structure field used here was arbitrarily chosen.
- * Empirically it doesn't seem to matter much which element is used,
- * and a clear explanation of the otherwise poor performance has not
- * been found/provided so far.
- */
- prefetchw(&vmcb->ldtr);
- return true;
- }
+ vmcb->fs.sel = 0;
+ vmcb->fs.attr = 0;
+ vmcb->fs.limit = 0;
+ vmcb->fs.base = fs_base;
+
+ vmcb->gs.sel = 0;
+ vmcb->gs.attr = 0;
+ vmcb->gs.limit = 0;
+ vmcb->gs.base = gs_base;
if ( likely(!ldt_ents) )
memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
@@ -1558,16 +1569,6 @@ bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
vmcb->ldtr.base = ldt_base;
}
- vmcb->fs.sel = 0;
- vmcb->fs.attr = 0;
- vmcb->fs.limit = 0;
- vmcb->fs.base = fs_base;
-
- vmcb->gs.sel = 0;
- vmcb->gs.attr = 0;
- vmcb->gs.limit = 0;
- vmcb->gs.base = gs_base;
-
vmcb->kerngsbase = gs_shadow;
svm_vmload_pa(per_cpu(host_vmcb, cpu));
diff --git a/xen/include/asm-x86/hvm/svm/svm.h b/xen/include/asm-x86/hvm/svm/svm.h
index 2310878e41..faeca40174 100644
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -50,12 +50,13 @@ void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len);
void svm_update_guest_cr(struct vcpu *, unsigned int cr, unsigned int flags);
/*
- * PV context switch helper. Calls with zero ldt_base request a prefetch of
- * the VMCB area to be loaded from, instead of an actual load of state.
+ * PV context switch helpers. Prefetching the VMCB area itself has been shown
+ * to be useful for performance.
*
* Must only be used for NUL FS/GS, as the segment attributes/limits are not
* read from the GDT/LDT.
*/
+void svm_load_segs_prefetch(void);
bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
unsigned long fs_base, unsigned long gs_base,
unsigned long gs_shadow);
--
generated by git-patchbot for /home/xen/git/xen.git#staging