|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH RFC 41/44] x86/smp: Switch to using the percpu stacks
This is very easy for the APs. __high_start() is modified to switch stacks
before entering C. The BSP, however, is more complicated, and needs to stay on
cpu0_stack[] until setup is complete.
The end of __start_xen() is modified to copy the top-of-stack data to the
percpu stack immediately before jumping there. The VMCS Host and SYSENTER
stacks are suitably adjusted, and become construction-time constant.
The stack_start variable and the stack_base[] array are removed completely, as
is the memguard_guard_stack() infrastructure. The STACK_ORDER xenheap
allocations are no longer needed, and the stacks of higher-numbered CPUs on
large machines are finally NUMA-local.
Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
xen/arch/x86/boot/x86_64.S | 15 ++++++++-------
xen/arch/x86/efi/efi-boot.h | 8 ++++----
xen/arch/x86/hvm/vmx/vmcs.c | 21 ++++++++++-----------
xen/arch/x86/mm.c | 15 ---------------
xen/arch/x86/setup.c | 29 +++++++++++++++++++----------
xen/arch/x86/smpboot.c | 18 ------------------
xen/arch/x86/tboot.c | 29 +----------------------------
xen/arch/x86/traps.c | 10 ++--------
xen/include/asm-arm/mm.h | 1 -
xen/include/asm-x86/mm.h | 3 ---
xen/include/xen/smp.h | 2 --
11 files changed, 44 insertions(+), 107 deletions(-)
diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S
index b1f0457..ed4c805 100644
--- a/xen/arch/x86/boot/x86_64.S
+++ b/xen/arch/x86/boot/x86_64.S
@@ -15,21 +15,25 @@ ENTRY(__high_start)
mov $XEN_MINIMAL_CR4,%rcx
mov %rcx,%cr4
- /* Set up %cr3 (differs between BSP and APs). */
+ /* Set up %cr3 and %rsp (differs between BSP and APs). */
test %ebx, %ebx
jz .Lbsp_setup
/* APs switch onto percpu_idle_pt[], as provided by do_boot_cpu(). */
mov ap_cr3(%rip), %rax
mov %rax, %cr3
+
+ /* APs move straight onto the PERCPU stack. */
+ movabs $STACK_SIZE - CPUINFO_sizeof + PERCPU_STACK_MAPPING, %rsp
+
jmp .Ldone
.Lbsp_setup:
/* The BSP stays on the idle_pg_table[] during early boot. */
-.Ldone:
- mov stack_start(%rip),%rsp
- or $(STACK_SIZE-CPUINFO_sizeof),%rsp
+ /* The BSP starts on cpu0_stack. */
+ lea STACK_SIZE - CPUINFO_sizeof + cpu0_stack(%rip), %rsp
+.Ldone:
/* Reset EFLAGS (subsumes CLI and CLD). */
pushq $0
@@ -61,9 +65,6 @@ GLOBAL(gdt_descr)
.word LAST_RESERVED_GDT_BYTE
.quad boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
-GLOBAL(stack_start)
- .quad cpu0_stack
-
.section .data.page_aligned, "aw", @progbits
.align PAGE_SIZE, 0
GLOBAL(boot_cpu_gdt_table)
diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h
index d30f688..8af661b 100644
--- a/xen/arch/x86/efi/efi-boot.h
+++ b/xen/arch/x86/efi/efi-boot.h
@@ -251,15 +251,15 @@ static void __init noreturn efi_arch_post_exit_boot(void)
#endif
"movabs $__start_xen, %[rip]\n\t"
"lgdt gdt_descr(%%rip)\n\t"
- "mov stack_start(%%rip), %%rsp\n\t"
+ "lea %c[stkoff] + cpu0_stack(%%rip), %%rsp\n\t"
"mov %[ds], %%ss\n\t"
"mov %[ds], %%ds\n\t"
"mov %[ds], %%es\n\t"
"mov %[ds], %%fs\n\t"
"mov %[ds], %%gs\n\t"
- "movl %[cs], 8(%%rsp)\n\t"
- "mov %[rip], (%%rsp)\n\t"
- "lretq %[stkoff]-16"
+ "push %[cs]\n\t"
+ "push %[rip]\n\t"
+ "lretq"
: [rip] "=&r" (efer/* any dead 64-bit variable */),
[cr4] "+&r" (cr4)
: [cr3] "r" (idle_pg_table),
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 795210f..483f72d 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -804,15 +804,6 @@ static void vmx_set_host_env(struct vcpu *v)
__vmwrite(HOST_TR_BASE, (unsigned long)&per_cpu(init_tss, cpu));
- __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
-
- /*
- * Skip end of cpu_user_regs when entering the hypervisor because the
- * CPU does not save context onto the stack. SS,RSP,CS,RIP,RFLAGS,etc
- * all get saved into the VMCS instead.
- */
- __vmwrite(HOST_RSP,
- (unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code);
}
void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr,
@@ -1148,13 +1139,21 @@ static int construct_vmcs(struct vcpu *v)
__vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
__vmwrite(HOST_CR4, mmu_cr4_features);
- /* Host CS:RIP. */
+ /* Host code/stack. */
__vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
__vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);
+ __vmwrite(HOST_RSP, /* VMExit doesn't push an exception frame. */
+ (PERCPU_STACK_MAPPING + STACK_SIZE -
+ sizeof(struct cpu_info) +
+ offsetof(struct cpu_info, guest_cpu_user_regs.error_code)));
- /* Host SYSENTER CS:RIP. */
+ /* Host SYSENTER code/stack. */
__vmwrite(HOST_SYSENTER_CS, __HYPERVISOR_CS);
__vmwrite(HOST_SYSENTER_EIP, (unsigned long)sysenter_entry);
+ __vmwrite(HOST_SYSENTER_ESP,
+ (PERCPU_STACK_MAPPING + STACK_SIZE -
+ sizeof(struct cpu_info) +
+ offsetof(struct cpu_info, guest_cpu_user_regs.es)));
/* MSR intercepts. */
__vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 933bd67..cb54921 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5281,21 +5281,6 @@ void memguard_unguard_range(void *p, unsigned long l)
#endif
-void memguard_guard_stack(void *p)
-{
- BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
- p = (void *)((unsigned long)p + STACK_SIZE -
- PRIMARY_STACK_SIZE - PAGE_SIZE);
- memguard_guard_range(p, PAGE_SIZE);
-}
-
-void memguard_unguard_stack(void *p)
-{
- p = (void *)((unsigned long)p + STACK_SIZE -
- PRIMARY_STACK_SIZE - PAGE_SIZE);
- memguard_unguard_range(p, PAGE_SIZE);
-}
-
void arch_dump_shared_mem_info(void)
{
printk("Shared frames %u -- Saved frames %u\n",
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index d624b95..c0f7289 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -651,8 +651,6 @@ static void noinline init_done(void)
/* Reinitalise all state referring to the old virtual address of the stack. */
static void __init noreturn reinit_bsp_stack(void)
{
- unsigned long *stack = (void*)(get_stack_bottom() & ~(STACK_SIZE - 1));
-
/* Sanity check that IST settings weren't set up before this point. */
ASSERT(MASK_EXTR(idt_tables[0][TRAP_nmi].a, 7UL << 32) == 0);
@@ -664,9 +662,6 @@ static void __init noreturn reinit_bsp_stack(void)
/* Update SYSCALL trampolines */
percpu_traps_init();
- stack_base[0] = stack;
- memguard_guard_stack(stack);
-
reset_stack_and_jump(init_done);
}
@@ -1744,11 +1739,25 @@ void __init noreturn __start_xen(unsigned long mbi_p)
setup_io_bitmap(dom0);
- /* Jump to the 1:1 virtual mappings of cpu0_stack. */
- asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
- [stk] "g" (__va(__pa(get_stack_bottom()))),
- [fn] "i" (reinit_bsp_stack) : "memory");
- unreachable();
+ /*
+ * Switch from cpu0_stack to the percpu stack, copying the non-GPR
+ * cpu_info data into place beforehand.
+ */
+ {
+ const struct cpu_info *src = get_cpu_info();
+ struct cpu_info *dst = _p(PERCPU_STACK_MAPPING + STACK_SIZE -
+ sizeof(*dst));
+
+ dst->processor_id = src->processor_id;
+ dst->current_vcpu = src->current_vcpu;
+ dst->per_cpu_offset = src->per_cpu_offset;
+ dst->cr4 = src->cr4;
+
+ asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
+ [stk] "g" (&dst->guest_cpu_user_regs.es),
+ [fn] "i" (reinit_bsp_stack) : "memory");
+ unreachable();
+ }
}
void arch_get_xen_caps(xen_capabilities_info_t *info)
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index f785d5f..77ee883 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -91,8 +91,6 @@ static enum cpu_state {
} cpu_state;
#define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0)
-void *stack_base[NR_CPUS];
-
void initialize_cpu_data(unsigned int cpu)
{
cpu_data[cpu] = boot_cpu_data;
@@ -386,7 +384,6 @@ void start_secondary(void *unused)
/* Used to pass percpu_idle_pt to the booting AP. */
paddr_t ap_cr3;
-extern void *stack_start;
static int wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
{
@@ -529,7 +526,6 @@ static int do_boot_cpu(int apicid, int cpu)
cpu, apicid, start_eip);
ap_cr3 = per_cpu(percpu_idle_pt, cpu);
- stack_start = stack_base[cpu];
/* This grunge runs the startup process for the targeted processor. */
@@ -1002,13 +998,6 @@ static void cpu_smpboot_free(unsigned int cpu)
free_xenheap_page(idt_tables[cpu]);
idt_tables[cpu] = NULL;
- if ( stack_base[cpu] != NULL )
- {
- memguard_unguard_stack(stack_base[cpu]);
- free_xenheap_pages(stack_base[cpu], STACK_ORDER);
- stack_base[cpu] = NULL;
- }
-
if ( per_cpu(percpu_idle_pt, cpu) )
{
free_domheap_page(maddr_to_page(per_cpu(percpu_idle_pt, cpu)));
@@ -1030,11 +1019,6 @@ static int cpu_smpboot_alloc(unsigned int cpu)
if ( node != NUMA_NO_NODE )
memflags = MEMF_node(node);
- stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
- if ( stack_base[cpu] == NULL )
- goto out;
- memguard_guard_stack(stack_base[cpu]);
-
order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
if ( gdt == NULL )
@@ -1148,8 +1132,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
boot_cpu_physical_apicid = get_apic_id();
x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
- stack_base[0] = stack_start;
-
set_nr_sockets();
socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c
index 59d7c47..c283b91 100644
--- a/xen/arch/x86/tboot.c
+++ b/xen/arch/x86/tboot.c
@@ -243,29 +243,6 @@ static void tboot_gen_domain_integrity(const uint8_t
key[TB_KEY_SIZE],
memset(&ctx, 0, sizeof(ctx));
}
-/*
- * For stack overflow detection in debug build, a guard page is set up.
- * This fn is used to detect whether a page is in the guarded pages for
- * the above reason.
- */
-static int mfn_in_guarded_stack(unsigned long mfn)
-{
- void *p;
- int i;
-
- for ( i = 0; i < nr_cpu_ids; i++ )
- {
- if ( !stack_base[i] )
- continue;
- p = (void *)((unsigned long)stack_base[i] + STACK_SIZE -
- PRIMARY_STACK_SIZE - PAGE_SIZE);
- if ( mfn == virt_to_mfn(p) )
- return -1;
- }
-
- return 0;
-}
-
static void tboot_gen_xenheap_integrity(const uint8_t key[TB_KEY_SIZE],
vmac_t *mac)
{
@@ -290,12 +267,8 @@ static void tboot_gen_xenheap_integrity(const uint8_t
key[TB_KEY_SIZE],
if ( is_page_in_use(page) && is_xen_heap_page(page) )
{
- void *pg;
-
- if ( mfn_in_guarded_stack(mfn) )
- continue; /* skip guard stack, see memguard_guard_stack() in
mm.c */
+ void *pg = mfn_to_virt(mfn);
- pg = mfn_to_virt(mfn);
vmac_update((uint8_t *)pg, PAGE_SIZE, &ctx);
}
}
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index eeabb4a..493f8f3 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -356,9 +356,6 @@ unsigned long get_stack_trace_bottom(unsigned long sp)
return ROUNDUP(sp, PAGE_SIZE) -
offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
-#ifndef MEMORY_GUARD
- case 3 ... 5:
-#endif
case 6 ... 7:
return ROUNDUP(sp, STACK_SIZE) -
sizeof(struct cpu_info) - sizeof(unsigned long);
@@ -375,9 +372,6 @@ unsigned long get_stack_dump_bottom(unsigned long sp)
case 0 ... 2:
return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
-#ifndef MEMORY_GUARD
- case 3 ... 5:
-#endif
case 6 ... 7:
return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
@@ -518,9 +512,9 @@ void show_stack_overflow(unsigned int cpu, const struct
cpu_user_regs *regs)
unsigned long esp_top, esp_bottom;
#endif
- if ( _p(curr_stack_base) != stack_base[cpu] )
+ if ( curr_stack_base != PERCPU_STACK_MAPPING )
printk("Current stack base %p differs from expected %p\n",
- _p(curr_stack_base), stack_base[cpu]);
+ _p(curr_stack_base), _p(PERCPU_STACK_MAPPING));
#ifdef MEMORY_GUARD
esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index 4d5563b..86b8fcb 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -362,7 +362,6 @@ unsigned long domain_get_maximum_gpfn(struct domain *d);
extern struct domain *dom_xen, *dom_io, *dom_cow;
-#define memguard_guard_stack(_p) ((void)0)
#define memguard_guard_range(_p,_l) ((void)0)
#define memguard_unguard_range(_p,_l) ((void)0)
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 22c2809..2c1ed1d 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -521,9 +521,6 @@ void memguard_unguard_range(void *p, unsigned long l);
#define memguard_unguard_range(_p,_l) ((void)0)
#endif
-void memguard_guard_stack(void *p);
-void memguard_unguard_stack(void *p);
-
struct mmio_ro_emulate_ctxt {
unsigned long cr2;
unsigned int seg, bdf;
diff --git a/xen/include/xen/smp.h b/xen/include/xen/smp.h
index c55f57f..d30f369 100644
--- a/xen/include/xen/smp.h
+++ b/xen/include/xen/smp.h
@@ -69,8 +69,6 @@ void smp_send_call_function_mask(const cpumask_t *mask);
int alloc_cpu_id(void);
-extern void *stack_base[NR_CPUS];
-
void initialize_cpu_data(unsigned int cpu);
#endif /* __XEN_SMP_H__ */
--
2.1.4
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |