[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()



The kernel has several code paths that read CR3.  Most of them assume that
CR3 contains the PGD's physical address, whereas some of them awkwardly
use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers.
This will keep them from breaking when PCID is enabled.

Cc: Tom Lendacky <thomas.lendacky@xxxxxxx>
Cc: Juergen Gross <jgross@xxxxxxxx>
Cc: xen-devel <xen-devel@xxxxxxxxxxxxx>
Cc: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---

Hi Ingo-

I broke this out because Tom's SME series and my PCID series both need it.
Please consider applying it to tip:x86/mm.

I'll send PCID v2 soon.  It'll apply to x86/mm + sched/urgent + this patch.

Thanks,
Andy

 arch/x86/boot/compressed/pagetable.c   |  2 +-
 arch/x86/include/asm/efi.h             |  2 +-
 arch/x86/include/asm/mmu_context.h     |  4 ++--
 arch/x86/include/asm/paravirt.h        |  2 +-
 arch/x86/include/asm/processor-flags.h | 36 ++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h       |  8 ++++++++
 arch/x86/include/asm/special_insns.h   | 10 +++++++---
 arch/x86/include/asm/tlbflush.h        |  4 ++--
 arch/x86/kernel/head64.c               |  3 ++-
 arch/x86/kernel/paravirt.c             |  2 +-
 arch/x86/kernel/process_32.c           |  2 +-
 arch/x86/kernel/process_64.c           |  2 +-
 arch/x86/kvm/vmx.c                     |  2 +-
 arch/x86/mm/fault.c                    | 10 +++++-----
 arch/x86/mm/ioremap.c                  |  2 +-
 arch/x86/platform/efi/efi_64.c         |  4 ++--
 arch/x86/platform/olpc/olpc-xo1-pm.c   |  2 +-
 arch/x86/power/cpu.c                   |  2 +-
 arch/x86/power/hibernate_64.c          |  3 ++-
 arch/x86/xen/mmu_pv.c                  |  6 +++---
 20 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/arch/x86/boot/compressed/pagetable.c 
b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..8e69df96492e 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -92,7 +92,7 @@ void initialize_identity_maps(void)
         * and we must append to the existing area instead of entirely
         * overwriting it.
         */
-       level4p = read_cr3();
+       level4p = read_cr3_pa();
        if (level4p == (unsigned long)_pgtable) {
                debug_putstr("booted via startup_32()\n");
                pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
        __kernel_fpu_begin();                                           \
                                                                        \
        if (efi_scratch.use_pgd) {                                      \
-               efi_scratch.prev_cr3 = read_cr3();                      \
+               efi_scratch.prev_cr3 = __read_cr3();                    \
                write_cr3((unsigned long)efi_scratch.efi_pgt);          \
                __flush_tlb_all();                                      \
        }                                                               \
diff --git a/arch/x86/include/asm/mmu_context.h 
b/arch/x86/include/asm/mmu_context.h
index 5a93f6261302..cfe6034ebfc6 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -269,7 +269,7 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
 
 /*
  * This can be used from process context to figure out what the value of
- * CR3 is without needing to do a (slow) read_cr3().
+ * CR3 is without needing to do a (slow) __read_cr3().
  *
  * It's intended to be used for code like KVM that sneakily changes CR3
  * and needs to restore it.  It needs to be used very carefully.
@@ -281,7 +281,7 @@ static inline unsigned long __get_current_cr3_fast(void)
        /* For now, be very restrictive about when this can be called. */
        VM_WARN_ON(in_nmi() || !in_atomic());
 
-       VM_BUG_ON(cr3 != read_cr3());
+       VM_BUG_ON(cr3 != __read_cr3());
        return cr3;
 }
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9a15739d9f4b..a63e77f8eb41 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
        PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
 }
 
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
 {
        return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
 }
diff --git a/arch/x86/include/asm/processor-flags.h 
b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
 #else
 #define X86_VM_MASK    0 /* No VM86 support */
 #endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID bits. */
+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+#define CR3_PCID_MASK 0xFFFull
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#endif
+
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..9de02c985aa4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
 native_cpuid_reg(ecx)
 native_cpuid_reg(edx)
 
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+       return __read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
        write_cr3(__pa(pgdir));
diff --git a/arch/x86/include/asm/special_insns.h 
b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
        asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
 }
 
-static inline unsigned long native_read_cr3(void)
+static inline unsigned long __native_read_cr3(void)
 {
        unsigned long val;
        asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
        native_write_cr2(x);
 }
 
-static inline unsigned long read_cr3(void)
+/*
+ * Careful!  CR3 contains more than just an address.  You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
 {
-       return native_read_cr3();
+       return __native_read_cr3();
 }
 
 static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 388c2463fde6..5f78c6a77578 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -156,7 +156,7 @@ static inline void __native_flush_tlb(void)
         * back:
         */
        preempt_disable();
-       native_write_cr3(native_read_cr3());
+       native_write_cr3(__native_read_cr3());
        preempt_enable();
 }
 
@@ -264,7 +264,7 @@ static inline void reset_lazy_tlbstate(void)
        this_cpu_write(cpu_tlbstate.state, 0);
        this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);
 
-       WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir));
+       WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir));
 }
 
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch 
*batch,
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..794e8f517a81 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -55,7 +55,8 @@ int __init early_make_pgtable(unsigned long address)
        pmdval_t pmd, *pmd_p;
 
        /* Invalid address or early pgt is done ?  */
-       if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+       if (physaddr >= MAXMEM ||
+           read_cr3_pa() != __pa_nodebug(early_level4_pgt))
                return -1;
 
 again:
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
        .read_cr2 = native_read_cr2,
        .write_cr2 = native_write_cr2,
-       .read_cr3 = native_read_cr3,
+       .read_cr3 = __native_read_cr3,
        .write_cr3 = native_write_cr3,
 
        .flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae818aa7a..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
        cr0 = read_cr0();
        cr2 = read_cr2();
-       cr3 = read_cr3();
+       cr3 = __read_cr3();
        cr4 = __read_cr4();
        printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
                        cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c39ab8bcc41..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
        cr0 = read_cr0();
        cr2 = read_cr2();
-       cr3 = read_cr3();
+       cr3 = __read_cr3();
        cr4 = __read_cr4();
 
        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 19cde555d73f..d143dd397dc9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5024,7 +5024,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx 
*vmx)
         * Save the most likely value for this task's CR3 in the VMCS.
         * We can't use __get_current_cr3_fast() because we're not atomic.
         */
-       cr3 = read_cr3();
+       cr3 = __read_cr3();
        vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables 
*/
        vmx->host_state.vmcs_host_cr3 = cr3;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a01cbc8..2a1fa10c6a98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
-       pgd_paddr = read_cr3();
+       pgd_paddr = read_cr3_pa();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
 
 static void dump_pagetable(unsigned long address)
 {
-       pgd_t *base = __va(read_cr3());
+       pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = &base[pgd_index(address)];
        p4d_t *p4d;
        pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
         * happen within a race in page table update. In the later
         * case just flush:
         */
-       pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
+       pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
 
 static void dump_pagetable(unsigned long address)
 {
-       pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+       pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = base + pgd_index(address);
        p4d_t *p4d;
        pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long 
error_code,
                pgd_t *pgd;
                pte_t *pte;
 
-               pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+               pgd = __va(read_cr3_pa());
                pgd += pgd_index(address);
 
                pte = lookup_address_in_pgd(pgd, address, &level);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] 
__page_aligned_bss;
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
        /* Don't assume we're using swapper_pg_dir at this point */
-       pgd_t *base = __va(read_cr3());
+       pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = &base[pgd_index(addr)];
        p4d_t *p4d = p4d_offset(pgd, addr);
        pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index eb8dff15a7f6..f40bf6230480 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
        int n_pgds, i, j;
 
        if (!efi_enabled(EFI_OLD_MEMMAP)) {
-               save_pgd = (pgd_t *)read_cr3();
+               save_pgd = (pgd_t *)__read_cr3();
                write_cr3((unsigned long)efi_scratch.efi_pgt);
                goto out;
        }
@@ -646,7 +646,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
        efi_sync_low_kernel_mappings();
        local_irq_save(flags);
 
-       efi_scratch.prev_cr3 = read_cr3();
+       efi_scratch.prev_cr3 = __read_cr3();
        write_cr3((unsigned long)efi_scratch.efi_pgt);
        __flush_tlb_all();
 
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c 
b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
 
 asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
 {
-       void *pgd_addr = __va(read_cr3());
+       void *pgd_addr = __va(read_cr3_pa());
 
        /* Program wakeup mask (using dword access to CS5536_PM1_EN) */
        outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context 
*ctxt)
         */
        ctxt->cr0 = read_cr0();
        ctxt->cr2 = read_cr2();
-       ctxt->cr3 = read_cr3();
+       ctxt->cr3 = __read_cr3();
        ctxt->cr4 = __read_cr4();
 #ifdef CONFIG_X86_64
        ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fee22ea..e3e62c8a8e70 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
        memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
 
        /* Make the page containing the relocated code executable */
-       pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
+       pgd = (pgd_t *)__va(read_cr3_pa()) +
+               pgd_index(relocated_restore_code);
        p4d = p4d_offset(pgd, relocated_restore_code);
        if (p4d_large(*p4d)) {
                set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 21beb37114b7..4f638309deea 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2017,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned 
long vaddr)
        pmd_t pmd;
        pte_t pte;
 
-       pa = read_cr3();
+       pa = read_cr3_pa();
        pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
                                                       sizeof(pgd)));
        if (!pgd_present(pgd))
@@ -2097,7 +2097,7 @@ void __init xen_relocate_p2m(void)
        pt_phys = pmd_phys + PFN_PHYS(n_pmd);
        p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
 
-       pgd = __va(read_cr3());
+       pgd = __va(read_cr3_pa());
        new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
        idx_p4d = 0;
        save_pud = n_pud;
@@ -2204,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
 {
        unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
 
-       BUG_ON(read_cr3() != __pa(initial_page_table));
+       BUG_ON(read_cr3_pa() != __pa(initial_page_table));
        BUG_ON(cr3 != __pa(swapper_pg_dir));
 
        /*
-- 
2.9.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.