x86: improve CR0 read/write handling TS is the only bit in CR0 that PV guests are permitted to change, so optimize the handling for that case: keep a cached copy of CR0 in a per-CPU variable, and issue HYPERVISOR_fpu_taskswitch hypercalls to update the TS bit. Only in the unusual case that something in the system still tries to modify another bit is a real MOV to CR0 executed (an attempt which would then be logged by the hypervisor). This removes the need to have the hypervisor emulate MOV to/from CR0 instructions in all halfway frequently executed code paths. Signed-off-by: Jan Beulich --- a/arch/i386/kernel/cpu/common-xen.c +++ b/arch/i386/kernel/cpu/common-xen.c @@ -32,6 +32,9 @@ EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); #ifndef CONFIG_XEN DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); +#else +DEFINE_PER_CPU(unsigned int, xen_x86_cr0); +EXPORT_PER_CPU_SYMBOL(xen_x86_cr0); #endif static int cachesize_override __cpuinitdata = -1; @@ -681,6 +684,7 @@ old_gdt: cpu_gdt_descr->size = GDT_SIZE - 1; cpu_gdt_descr->address = (unsigned long)gdt; #else + __get_cpu_var(xen_x86_cr0) = raw_read_cr0(); if (cpu == 0 && cpu_gdt_descr->address == 0) { gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); /* alloc_bootmem_pages panics on failure, so no check */ --- a/arch/i386/kernel/process-xen.c +++ b/arch/i386/kernel/process-xen.c @@ -641,6 +641,8 @@ struct task_struct fastcall * __switch_t BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) BUG(); + if (_mcl->op == __HYPERVISOR_fpu_taskswitch) + __get_cpu_var(xen_x86_cr0) |= X86_CR0_TS; /* * Restore %fs and %gs if needed. --- a/arch/i386/kernel/traps-xen.c +++ b/arch/i386/kernel/traps-xen.c @@ -1057,6 +1057,7 @@ asmlinkage void math_state_restore(struc struct task_struct *tsk = thread->task; /* NB. 'clts' is done for us by Xen during virtual trap. 
*/ + __get_cpu_var(xen_x86_cr0) &= ~X86_CR0_TS; if (!tsk_used_math(tsk)) init_fpu(tsk); restore_fpu(tsk); --- a/arch/x86_64/kernel/process-xen.c +++ b/arch/x86_64/kernel/process-xen.c @@ -574,6 +574,8 @@ __switch_to(struct task_struct *prev_p, BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) BUG(); + if (_mcl->op == __HYPERVISOR_fpu_taskswitch) + __get_cpu_var(xen_x86_cr0) |= X86_CR0_TS; /* * Switch DS and ES. --- a/arch/x86_64/kernel/setup64-xen.c +++ b/arch/x86_64/kernel/setup64-xen.c @@ -126,6 +126,9 @@ void __init setup_per_cpu_areas(void) } #ifdef CONFIG_XEN +DEFINE_PER_CPU(unsigned long, xen_x86_cr0); +EXPORT_PER_CPU_SYMBOL(xen_x86_cr0); + static void switch_pt(void) { xen_pt_switch(__pa_symbol(init_level4_pgt)); @@ -174,6 +177,7 @@ void pda_init(int cpu) if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (unsigned long)pda)) BUG(); + __get_cpu_var(xen_x86_cr0) = raw_read_cr0(); #endif pda->cpunumber = cpu; pda->irqcount = -1; --- a/arch/x86_64/kernel/traps-xen.c +++ b/arch/x86_64/kernel/traps-xen.c @@ -1075,8 +1075,9 @@ asmlinkage void __attribute__((weak)) mc asmlinkage void math_state_restore(void) { struct task_struct *me = current; - /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */ + /* NB. 'clts' is done for us by Xen during virtual trap. 
*/ + __get_cpu_var(xen_x86_cr0) &= ~X86_CR0_TS; if (!used_math()) init_fpu(me); restore_fpu_checking(&me->thread.i387.fxsave); --- a/include/asm-i386/mach-xen/asm/system.h +++ b/include/asm-i386/mach-xen/asm/system.h @@ -2,8 +2,10 @@ #define __ASM_SYSTEM_H #include +#include #include #include +#include #include /* for LOCK_PREFIX */ #include #include @@ -90,15 +91,30 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" #define savesegment(seg, value) \ asm volatile("mov %%" #seg ",%0":"=rm" (value)) -#define read_cr0() ({ \ +DECLARE_PER_CPU(unsigned int, xen_x86_cr0); + +#define raw_read_cr0() ({ \ unsigned int __dummy; \ __asm__ __volatile__( \ "movl %%cr0,%0\n\t" \ :"=r" (__dummy)); \ __dummy; \ }) -#define write_cr0(x) \ - __asm__ __volatile__("movl %0,%%cr0": :"r" (x)) +#define read_cr0() __get_cpu_var(xen_x86_cr0) +#define write_cr0(x) do { \ + unsigned int x__ = (x); \ + switch (x__ ^ __get_cpu_var(xen_x86_cr0)) { \ + case 0: \ + continue; \ + case X86_CR0_TS: \ + HYPERVISOR_fpu_taskswitch(!!(x__ & X86_CR0_TS)); \ + break; \ + default: \ + __asm__ __volatile__("movl %0,%%cr0": :"r" (x__)); \ + break; \ + } \ + __get_cpu_var(xen_x86_cr0) = x__; \ +} while (0) #define read_cr2() (current_vcpu_info()->arch.cr2) #define write_cr2(x) \ @@ -142,8 +158,19 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" /* * Clear and set 'TS' bit respectively */ -#define clts() (HYPERVISOR_fpu_taskswitch(0)) -#define stts() (HYPERVISOR_fpu_taskswitch(1)) +#define X86_CR0_TS 8 +#define clts() ({ \ + if (__get_cpu_var(xen_x86_cr0) & X86_CR0_TS) { \ + HYPERVISOR_fpu_taskswitch(0); \ + __get_cpu_var(xen_x86_cr0) &= ~X86_CR0_TS; \ + } \ +}) +#define stts() ({ \ + if (!(__get_cpu_var(xen_x86_cr0) & X86_CR0_TS)) { \ + HYPERVISOR_fpu_taskswitch(1); \ + __get_cpu_var(xen_x86_cr0) |= X86_CR0_TS; \ + } \ +}) #endif /* __KERNEL__ */ --- a/include/asm-x86_64/mach-xen/asm/system.h +++ b/include/asm-x86_64/mach-xen/asm/system.h @@ -7,7 +7,7 @@ #include #include -#include +#include #ifdef __KERNEL__ @@ -71,18 
+71,41 @@ struct alt_instr { /* * Clear and set 'TS' bit respectively */ -#define clts() (HYPERVISOR_fpu_taskswitch(0)) +#define X86_CR0_TS 8 +#define clts() ({ \ + if (__get_cpu_var(xen_x86_cr0) & X86_CR0_TS) { \ + HYPERVISOR_fpu_taskswitch(0); \ + __get_cpu_var(xen_x86_cr0) &= ~X86_CR0_TS; \ + } \ +}) -static inline unsigned long read_cr0(void) +DECLARE_PER_CPU(unsigned long, xen_x86_cr0); + +static inline unsigned long raw_read_cr0(void) { unsigned long cr0; asm volatile("movq %%cr0,%0" : "=r" (cr0)); return cr0; } +static inline unsigned long read_cr0(void) +{ + return __get_cpu_var(xen_x86_cr0); +} + static inline void write_cr0(unsigned long val) { - asm volatile("movq %0,%%cr0" :: "r" (val)); + switch (val ^ __get_cpu_var(xen_x86_cr0)) { + case 0: + return; + case X86_CR0_TS: + HYPERVISOR_fpu_taskswitch(!!(val & X86_CR0_TS)); + break; + default: + asm volatile("movq %0,%%cr0" :: "r" (val)); + break; + } + __get_cpu_var(xen_x86_cr0) = val; } #define read_cr3() ({ \ @@ -103,7 +126,12 @@ static inline void write_cr4(unsigned lo asm volatile("movq %0,%%cr4" :: "r" (val)); } -#define stts() (HYPERVISOR_fpu_taskswitch(1)) +#define stts() ({ \ + if (!(__get_cpu_var(xen_x86_cr0) & X86_CR0_TS)) { \ + HYPERVISOR_fpu_taskswitch(1); \ + __get_cpu_var(xen_x86_cr0) |= X86_CR0_TS; \ + } \ +}) #define wbinvd() \ __asm__ __volatile__ ("wbinvd": : :"memory");