[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 03/04] Kexec / Kdump: x86_32 specific code
[PATCH 03/04] Kexec / Kdump: x86_32 specific code This patch contains the x86_32 implementation of Kexec / Kdump for Xen. Signed-Off-By: Magnus Damm <magnus@xxxxxxxxxxxxx> --- Applies on top of xen-unstable-11856. buildconfigs/linux-defconfig_xen_x86_32 | 2 linux-2.6-xen-sparse/arch/i386/Kconfig | 2 linux-2.6-xen-sparse/arch/i386/kernel/Makefile | 2 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c | 25 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h | 51 + linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h | 8 patches/linux-2.6.16.29/git-35...c9.patch | 401 +++++++ patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec..code-i386.patch | 169 +++ patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch | 54 + patches/linux-2.6.16.29/series | 3 xen/arch/x86/x86_32/entry.S | 2 xen/include/asm-x86/x86_32/elf.h | 37 xen/include/asm-x86/x86_32/kexec.h | 84 +- 13 files changed, 817 insertions(+), 23 deletions(-) --- 0002/buildconfigs/linux-defconfig_xen_x86_32 +++ work/buildconfigs/linux-defconfig_xen_x86_32 2006-10-23 11:36:16.000000000 +0900 @@ -183,6 +183,7 @@ CONFIG_MTRR=y CONFIG_REGPARM=y CONFIG_SECCOMP=y CONFIG_HZ_100=y +CONFIG_KEXEC=y # CONFIG_HZ_250 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=100 @@ -1036,6 +1037,7 @@ CONFIG_DNOTIFY=y # CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y +# CONFIG_PROC_VMCORE is not set CONFIG_SYSFS=y CONFIG_TMPFS=y # CONFIG_HUGETLB_PAGE is not set --- 0001/linux-2.6-xen-sparse/arch/i386/Kconfig +++ work/linux-2.6-xen-sparse/arch/i386/Kconfig 2006-10-23 11:36:16.000000000 +0900 @@ -726,7 +726,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL && !X86_XEN + depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. 
It is like a reboot --- 0001/linux-2.6-xen-sparse/arch/i386/kernel/Makefile +++ work/linux-2.6-xen-sparse/arch/i386/kernel/Makefile 2006-10-23 11:36:16.000000000 +0900 @@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen obj-y += fixup.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o -n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) obj-y := $(call cherrypickxen, $(obj-y)) --- 0001/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c +++ work/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c 2006-10-23 11:36:16.000000000 +0900 @@ -69,6 +69,10 @@ #include "setup_arch_pre.h" #include <bios_ebda.h> +#ifdef CONFIG_XEN +#include <xen/interface/kexec.h> +#endif + /* Forward Declaration. */ void __init find_max_pfn(void); @@ -943,6 +947,7 @@ static void __init parse_cmdline_early ( * after a kernel panic. */ else if (!memcmp(from, "crashkernel=", 12)) { +#ifndef CONFIG_XEN unsigned long size, base; size = memparse(from+12, &from); if (*from == '@') { @@ -953,6 +958,10 @@ static void __init parse_cmdline_early ( crashk_res.start = base; crashk_res.end = base + size - 1; } +#else + printk("Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); +#endif } #endif #ifdef CONFIG_PROC_VMCORE @@ -1322,9 +1331,22 @@ void __init setup_bootmem_allocator(void } #endif #ifdef CONFIG_KEXEC +#ifndef CONFIG_XEN if (crashk_res.start != crashk_res.end) reserve_bootmem(crashk_res.start, crashk_res.end - crashk_res.start + 1); +#else + { + xen_kexec_reserve_t reservation; + BUG_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_reserve, + &reservation)); + if (reservation.size) { + crashk_res.start = reservation.start; + crashk_res.end = reservation.start + + reservation.size - 1; + } + } +#endif #endif if (!xen_feature(XENFEAT_auto_translated_physmap)) @@ -1389,7 +1411,8 @@ legacy_init_iomem_resources(struct e820e 
request_resource(res, data_resource); #endif #ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); + if (crashk_res.start != crashk_res.end) + request_resource(res, &crashk_res); #endif } } --- /dev/null +++ work/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h 2006-10-23 11:36:17.000000000 +0900 @@ -0,0 +1,51 @@ +#ifndef _I386_KEXEC_XEN_H +#define _I386_KEXEC_XEN_H + +#include <asm/ptrace.h> +#include <asm/types.h> +#include <xen/interface/arch-x86_32.h> + +static inline void crash_translate_regs(struct pt_regs *linux_regs, + struct cpu_user_regs *xen_regs) +{ + xen_regs->ebx = linux_regs->ebx; + xen_regs->ecx = linux_regs->ecx; + xen_regs->edx = linux_regs->edx; + xen_regs->esi = linux_regs->esi; + xen_regs->edi = linux_regs->edi; + xen_regs->ebp = linux_regs->ebp; + xen_regs->eax = linux_regs->eax; + xen_regs->esp = linux_regs->esp; + xen_regs->ss = linux_regs->xss; + xen_regs->cs = linux_regs->xcs; + xen_regs->ds = linux_regs->xds; + xen_regs->es = linux_regs->xes; + xen_regs->eflags = linux_regs->eflags; +} + +/* Kexec needs to know about the actual physical address. + * But in xen, on some architectures, a physical address is a + * pseudo-physical address. 
*/ +#ifdef CONFIG_XEN +#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page)) +#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn)) +#define kexec_virt_to_phys(addr) virt_to_machine(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr)) +#else +#define kexec_page_to_pfn(page) page_to_pfn(page) +#define kexec_pfn_to_page(pfn) pfn_to_page(pfn) +#define kexec_virt_to_phys(addr) virt_to_phys(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(addr) +#endif + +#endif /* _I386_KEXEC_XEN_H */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ --- 0001/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h +++ work/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h 2006-10-23 11:36:16.000000000 +0900 @@ -385,5 +385,13 @@ HYPERVISOR_xenoprof_op( return _hypercall2(int, xenoprof_op, op, arg); } +static inline int +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} + + #endif /* __HYPERCALL_H__ */ --- /dev/null +++ work/patches/linux-2.6.16.29/git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch 2006-10-23 11:36:17.000000000 +0900 @@ -0,0 +1,401 @@ +From: Magnus Damm <magnus@xxxxxxxxxxxxx> +Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200) +Subject: [PATCH] i386: Avoid overwriting the current pgd (V4, i386) +X-Git-Url: http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=3566561bfadffcb5dbc85d576be80c0dbf2cccc9 + +[PATCH] i386: Avoid overwriting the current pgd (V4, i386) + +kexec: Avoid overwriting the current pgd (V4, i386) + +This patch upgrades the i386-specific kexec code to avoid overwriting the +current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used +to start a secondary kernel that dumps the memory of the previous kernel. + +The code introduces a new set of page tables. 
These tables are used to provide +an executable identity mapping without overwriting the current pgd. + +Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx> +Signed-off-by: Andi Kleen <ak@xxxxxxx> +--- + +--- a/arch/i386/kernel/machine_kexec.c ++++ b/arch/i386/kernel/machine_kexec.c +@@ -21,70 +21,13 @@ + #include <asm/system.h> + + #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +- +-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +-#define L2_ATTR (_PAGE_PRESENT) +- +-#define LEVEL0_SIZE (1UL << 12UL) +- +-#ifndef CONFIG_X86_PAE +-#define LEVEL1_SIZE (1UL << 22UL) +-static u32 pgtable_level1[1024] PAGE_ALIGNED; +- +-static void identity_map_page(unsigned long address) +-{ +- unsigned long level1_index, level2_index; +- u32 *pgtable_level2; +- +- /* Find the current page table */ +- pgtable_level2 = __va(read_cr3()); +- +- /* Find the indexes of the physical address to identity map */ +- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; +- level2_index = address / LEVEL1_SIZE; +- +- /* Identity map the page table entry */ +- pgtable_level1[level1_index] = address | L0_ATTR; +- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; +- +- /* Flush the tlb so the new mapping takes effect. +- * Global tlb entries are not flushed but that is not an issue. 
+- */ +- load_cr3(pgtable_level2); +-} +- +-#else +-#define LEVEL1_SIZE (1UL << 21UL) +-#define LEVEL2_SIZE (1UL << 30UL) +-static u64 pgtable_level1[512] PAGE_ALIGNED; +-static u64 pgtable_level2[512] PAGE_ALIGNED; +- +-static void identity_map_page(unsigned long address) +-{ +- unsigned long level1_index, level2_index, level3_index; +- u64 *pgtable_level3; +- +- /* Find the current page table */ +- pgtable_level3 = __va(read_cr3()); +- +- /* Find the indexes of the physical address to identity map */ +- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; +- level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; +- level3_index = address / LEVEL2_SIZE; +- +- /* Identity map the page table entry */ +- pgtable_level1[level1_index] = address | L0_ATTR; +- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; +- set_64bit(&pgtable_level3[level3_index], +- __pa(pgtable_level2) | L2_ATTR); +- +- /* Flush the tlb so the new mapping takes effect. +- * Global tlb entries are not flushed but that is not an issue. 
+- */ +- load_cr3(pgtable_level3); +-} ++static u32 kexec_pgd[1024] PAGE_ALIGNED; ++#ifdef CONFIG_X86_PAE ++static u32 kexec_pmd0[1024] PAGE_ALIGNED; ++static u32 kexec_pmd1[1024] PAGE_ALIGNED; + #endif ++static u32 kexec_pte0[1024] PAGE_ALIGNED; ++static u32 kexec_pte1[1024] PAGE_ALIGNED; + + static void set_idt(void *newidt, __u16 limit) + { +@@ -128,16 +71,6 @@ static void load_segments(void) + #undef __STR + } + +-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( +- unsigned long indirection_page, +- unsigned long reboot_code_buffer, +- unsigned long start_address, +- unsigned int has_pae) ATTRIB_NORET; +- +-extern const unsigned char relocate_new_kernel[]; +-extern void relocate_new_kernel_end(void); +-extern const unsigned int relocate_new_kernel_size; +- + /* + * A architecture hook called to validate the + * proposed image and prepare the control pages +@@ -170,25 +103,29 @@ void machine_kexec_cleanup(struct kimage + */ + NORET_TYPE void machine_kexec(struct kimage *image) + { +- unsigned long page_list; +- unsigned long reboot_code_buffer; +- +- relocate_new_kernel_t rnk; ++ unsigned long page_list[PAGES_NR]; ++ void *control_page; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + +- /* Compute some offsets */ +- reboot_code_buffer = page_to_pfn(image->control_code_page) +- << PAGE_SHIFT; +- page_list = image->head; +- +- /* Set up an identity mapping for the reboot_code_buffer */ +- identity_map_page(reboot_code_buffer); +- +- /* copy it out */ +- memcpy((void *)reboot_code_buffer, relocate_new_kernel, +- relocate_new_kernel_size); ++ control_page = page_address(image->control_code_page); ++ memcpy(control_page, relocate_kernel, PAGE_SIZE); ++ ++ page_list[PA_CONTROL_PAGE] = __pa(control_page); ++ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; ++ page_list[PA_PGD] = __pa(kexec_pgd); ++ page_list[VA_PGD] = (unsigned long)kexec_pgd; ++#ifdef CONFIG_X86_PAE ++ page_list[PA_PMD_0] = __pa(kexec_pmd0); ++ 
page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; ++ page_list[PA_PMD_1] = __pa(kexec_pmd1); ++ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; ++#endif ++ page_list[PA_PTE_0] = __pa(kexec_pte0); ++ page_list[VA_PTE_0] = (unsigned long)kexec_pte0; ++ page_list[PA_PTE_1] = __pa(kexec_pte1); ++ page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + + /* The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is +@@ -207,8 +144,8 @@ NORET_TYPE void machine_kexec(struct kim + set_idt(phys_to_virt(0),0); + + /* now call it */ +- rnk = (relocate_new_kernel_t) reboot_code_buffer; +- (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); ++ relocate_kernel((unsigned long)image->head, (unsigned long)page_list, ++ image->start, cpu_has_pae); + } + + /* crashkernel=size@addr specifies the location to reserve for +--- a/arch/i386/kernel/relocate_kernel.S ++++ b/arch/i386/kernel/relocate_kernel.S +@@ -7,16 +7,138 @@ + */ + + #include <linux/linkage.h> ++#include <asm/page.h> ++#include <asm/kexec.h> ++ ++/* ++ * Must be relocatable PIC code callable as a C function ++ */ ++ ++#define PTR(x) (x << 2) ++#define PAGE_ALIGNED (1 << PAGE_SHIFT) ++#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ ++#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */ ++ ++ .text ++ .align PAGE_ALIGNED ++ .globl relocate_kernel ++relocate_kernel: ++ movl 8(%esp), %ebp /* list of pages */ ++ ++#ifdef CONFIG_X86_PAE ++ /* map the control page at its virtual address */ ++ ++ movl PTR(VA_PGD)(%ebp), %edi ++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax ++ andl $0xc0000000, %eax ++ shrl $27, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_PMD_0)(%ebp), %edx ++ orl $PAE_PGD_ATTR, %edx ++ movl %edx, (%eax) ++ ++ movl PTR(VA_PMD_0)(%ebp), %edi ++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax ++ andl $0x3fe00000, %eax ++ shrl $18, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_PTE_0)(%ebp), %edx ++ orl $PAGE_ATTR, %edx ++ movl %edx, (%eax) ++ ++ 
movl PTR(VA_PTE_0)(%ebp), %edi ++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax ++ andl $0x001ff000, %eax ++ shrl $9, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx ++ orl $PAGE_ATTR, %edx ++ movl %edx, (%eax) ++ ++ /* identity map the control page at its physical address */ ++ ++ movl PTR(VA_PGD)(%ebp), %edi ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax ++ andl $0xc0000000, %eax ++ shrl $27, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_PMD_1)(%ebp), %edx ++ orl $PAE_PGD_ATTR, %edx ++ movl %edx, (%eax) ++ ++ movl PTR(VA_PMD_1)(%ebp), %edi ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax ++ andl $0x3fe00000, %eax ++ shrl $18, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_PTE_1)(%ebp), %edx ++ orl $PAGE_ATTR, %edx ++ movl %edx, (%eax) ++ ++ movl PTR(VA_PTE_1)(%ebp), %edi ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax ++ andl $0x001ff000, %eax ++ shrl $9, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx ++ orl $PAGE_ATTR, %edx ++ movl %edx, (%eax) ++#else ++ /* map the control page at its virtual address */ ++ ++ movl PTR(VA_PGD)(%ebp), %edi ++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax ++ andl $0xffc00000, %eax ++ shrl $20, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_PTE_0)(%ebp), %edx ++ orl $PAGE_ATTR, %edx ++ movl %edx, (%eax) ++ ++ movl PTR(VA_PTE_0)(%ebp), %edi ++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax ++ andl $0x003ff000, %eax ++ shrl $10, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx ++ orl $PAGE_ATTR, %edx ++ movl %edx, (%eax) ++ ++ /* identity map the control page at its physical address */ ++ ++ movl PTR(VA_PGD)(%ebp), %edi ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax ++ andl $0xffc00000, %eax ++ shrl $20, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_PTE_1)(%ebp), %edx ++ orl $PAGE_ATTR, %edx ++ movl %edx, (%eax) ++ ++ movl PTR(VA_PTE_1)(%ebp), %edi ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax ++ andl $0x003ff000, %eax ++ shrl $10, %eax ++ addl %edi, %eax ++ ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx ++ orl $PAGE_ATTR, %edx ++ movl %edx, (%eax) 
++#endif + +- /* +- * Must be relocatable PIC code callable as a C function, that once +- * it starts can not use the previous processes stack. +- */ +- .globl relocate_new_kernel + relocate_new_kernel: + /* read the arguments and say goodbye to the stack */ + movl 4(%esp), %ebx /* page_list */ +- movl 8(%esp), %ebp /* reboot_code_buffer */ ++ movl 8(%esp), %ebp /* list of pages */ + movl 12(%esp), %edx /* start address */ + movl 16(%esp), %ecx /* cpu_has_pae */ + +@@ -24,11 +146,26 @@ relocate_new_kernel: + pushl $0 + popfl + +- /* set a new stack at the bottom of our page... */ +- lea 4096(%ebp), %esp ++ /* get physical address of control page now */ ++ /* this is impossible after page table switch */ ++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi ++ ++ /* switch to new set of page tables */ ++ movl PTR(PA_PGD)(%ebp), %eax ++ movl %eax, %cr3 ++ ++ /* setup a new stack at the end of the physical control page */ ++ lea 4096(%edi), %esp + +- /* store the parameters back on the stack */ +- pushl %edx /* store the start address */ ++ /* jump to identity mapped page */ ++ movl %edi, %eax ++ addl $(identity_mapped - relocate_kernel), %eax ++ pushl %eax ++ ret ++ ++identity_mapped: ++ /* store the start address on the stack */ ++ pushl %edx + + /* Set cr0 to a known state: + * 31 0 == Paging disabled +@@ -113,8 +250,3 @@ relocate_new_kernel: + xorl %edi, %edi + xorl %ebp, %ebp + ret +-relocate_new_kernel_end: +- +- .globl relocate_new_kernel_size +-relocate_new_kernel_size: +- .long relocate_new_kernel_end - relocate_new_kernel +--- a/include/asm-i386/kexec.h ++++ b/include/asm-i386/kexec.h +@@ -1,6 +1,26 @@ + #ifndef _I386_KEXEC_H + #define _I386_KEXEC_H + ++#define PA_CONTROL_PAGE 0 ++#define VA_CONTROL_PAGE 1 ++#define PA_PGD 2 ++#define VA_PGD 3 ++#define PA_PTE_0 4 ++#define VA_PTE_0 5 ++#define PA_PTE_1 6 ++#define VA_PTE_1 7 ++#ifdef CONFIG_X86_PAE ++#define PA_PMD_0 8 ++#define VA_PMD_0 9 ++#define PA_PMD_1 10 ++#define VA_PMD_1 11 ++#define PAGES_NR 12 ++#else 
++#define PAGES_NR 8 ++#endif ++ ++#ifndef __ASSEMBLY__ ++ + #include <asm/fixmap.h> + #include <asm/ptrace.h> + #include <asm/string.h> +@@ -72,5 +92,12 @@ static inline void crash_setup_regs(stru + newregs->eip = (unsigned long)current_text_addr(); + } + } ++asmlinkage NORET_TYPE void ++relocate_kernel(unsigned long indirection_page, ++ unsigned long control_page, ++ unsigned long start_address, ++ unsigned int has_pae) ATTRIB_NORET; ++ ++#endif /* __ASSEMBLY__ */ + + #endif /* _I386_KEXEC_H */ --- /dev/null +++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-move_segment_code-i386.patch 2006-10-23 11:36:17.000000000 +0900 @@ -0,0 +1,169 @@ +kexec: Move asm segment handling code to the assembly file (i386) + +This patch moves the idt, gdt, and segment handling code from machine_kexec.c +to relocate_kernel.S. The main reason behind this move is to avoid code +duplication in the Xen hypervisor. With this patch all code required to kexec +is put on the control page. + +On top of that this patch also counts as a cleanup - I think it is much +nicer to write assembly directly in assembly files than wrap inline assembly +in C functions for no apparent reason. + +Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx> +--- + + Applies to 2.6.19-rc1. 
+ + machine_kexec.c | 59 ----------------------------------------------------- + relocate_kernel.S | 58 +++++++++++++++++++++++++++++++++++++++++++++++----- + 2 files changed, 53 insertions(+), 64 deletions(-) + +--- 0002/arch/i386/kernel/machine_kexec.c ++++ work/arch/i386/kernel/machine_kexec.c 2006-10-05 15:49:08.000000000 +0900 +@@ -29,48 +29,6 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED + static u32 kexec_pte0[1024] PAGE_ALIGNED; + static u32 kexec_pte1[1024] PAGE_ALIGNED; + +-static void set_idt(void *newidt, __u16 limit) +-{ +- struct Xgt_desc_struct curidt; +- +- /* ia32 supports unaliged loads & stores */ +- curidt.size = limit; +- curidt.address = (unsigned long)newidt; +- +- load_idt(&curidt); +-}; +- +- +-static void set_gdt(void *newgdt, __u16 limit) +-{ +- struct Xgt_desc_struct curgdt; +- +- /* ia32 supports unaligned loads & stores */ +- curgdt.size = limit; +- curgdt.address = (unsigned long)newgdt; +- +- load_gdt(&curgdt); +-}; +- +-static void load_segments(void) +-{ +-#define __STR(X) #X +-#define STR(X) __STR(X) +- +- __asm__ __volatile__ ( +- "\tljmp $"STR(__KERNEL_CS)",$1f\n" +- "\t1:\n" +- "\tmovl $"STR(__KERNEL_DS)",%%eax\n" +- "\tmovl %%eax,%%ds\n" +- "\tmovl %%eax,%%es\n" +- "\tmovl %%eax,%%fs\n" +- "\tmovl %%eax,%%gs\n" +- "\tmovl %%eax,%%ss\n" +- ::: "eax", "memory"); +-#undef STR +-#undef __STR +-} +- + /* + * A architecture hook called to validate the + * proposed image and prepare the control pages +@@ -127,23 +85,6 @@ NORET_TYPE void machine_kexec(struct kim + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + +- /* The segment registers are funny things, they have both a +- * visible and an invisible part. Whenever the visible part is +- * set to a specific selector, the invisible part is loaded +- * with from a table in memory. At no other time is the +- * descriptor table in memory accessed. 
+- * +- * I take advantage of this here by force loading the +- * segments, before I zap the gdt with an invalid value. +- */ +- load_segments(); +- /* The gdt & idt are now invalid. +- * If you want to load them you must set up your own idt & gdt. +- */ +- set_gdt(phys_to_virt(0),0); +- set_idt(phys_to_virt(0),0); +- +- /* now call it */ + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start, cpu_has_pae); + } +--- 0002/arch/i386/kernel/relocate_kernel.S ++++ work/arch/i386/kernel/relocate_kernel.S 2006-10-05 16:03:21.000000000 +0900 +@@ -154,14 +154,45 @@ relocate_new_kernel: + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, %cr3 + ++ /* setup idt */ ++ movl %edi, %eax ++ addl $(idt_48 - relocate_kernel), %eax ++ lidtl (%eax) ++ ++ /* setup gdt */ ++ movl %edi, %eax ++ addl $(gdt - relocate_kernel), %eax ++ movl %edi, %esi ++ addl $((gdt_48 - relocate_kernel) + 2), %esi ++ movl %eax, (%esi) ++ ++ movl %edi, %eax ++ addl $(gdt_48 - relocate_kernel), %eax ++ lgdtl (%eax) ++ ++ /* setup data segment registers */ ++ mov $(gdt_ds - gdt), %eax ++ mov %eax, %ds ++ mov %eax, %es ++ mov %eax, %fs ++ mov %eax, %gs ++ mov %eax, %ss ++ + /* setup a new stack at the end of the physical control page */ + lea 4096(%edi), %esp + +- /* jump to identity mapped page */ +- movl %edi, %eax +- addl $(identity_mapped - relocate_kernel), %eax +- pushl %eax +- ret ++ /* load new code segment and jump to identity mapped page */ ++ movl %edi, %esi ++ xorl %eax, %eax ++ pushl %eax ++ pushl %esi ++ pushl %eax ++ movl $(gdt_cs - gdt), %eax ++ pushl %eax ++ movl %edi, %eax ++ addl $(identity_mapped - relocate_kernel),%eax ++ pushl %eax ++ iretl + + identity_mapped: + /* store the start address on the stack */ +@@ -250,3 +281,20 @@ identity_mapped: + xorl %edi, %edi + xorl %ebp, %ebp + ret ++ ++ .align 16 ++gdt: ++ .quad 0x0000000000000000 /* NULL descriptor */ ++gdt_cs: ++ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ ++gdt_ds: ++ .quad 
0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ ++gdt_end: ++ ++gdt_48: ++ .word gdt_end - gdt - 1 /* limit */ ++ .long 0 /* base - filled in by code above */ ++ ++idt_48: ++ .word 0 /* limit */ ++ .long 0 /* base */ --- /dev/null +++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch 2006-10-23 11:36:17.000000000 +0900 @@ -0,0 +1,54 @@ +--- 0004/arch/i386/kernel/machine_kexec.c ++++ work/arch/i386/kernel/machine_kexec.c 2006-10-11 18:34:06.000000000 +0900 +@@ -20,6 +20,10 @@ + #include <asm/desc.h> + #include <asm/system.h> + ++#ifdef CONFIG_XEN ++#include <xen/interface/kexec.h> ++#endif ++ + #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) + static u32 kexec_pgd[1024] PAGE_ALIGNED; + #ifdef CONFIG_X86_PAE +@@ -29,6 +33,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED + static u32 kexec_pte0[1024] PAGE_ALIGNED; + static u32 kexec_pte1[1024] PAGE_ALIGNED; + ++#ifdef CONFIG_XEN ++ ++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) ++ ++#if PAGES_NR > KEXEC_XEN_NO_PAGES ++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break ++#endif ++ ++#if PA_CONTROL_PAGE != 0 ++#error PA_CONTROL_PAGE is non zero - Xen support will break ++#endif ++ ++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) ++{ ++ void *control_page; ++ ++ memset(xki->page_list, 0, sizeof(xki->page_list)); ++ ++ control_page = page_address(image->control_code_page); ++ memcpy(control_page, relocate_kernel, PAGE_SIZE); ++ ++ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); ++ xki->page_list[PA_PGD] = __ma(kexec_pgd); ++#ifdef CONFIG_X86_PAE ++ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); ++ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); ++#endif ++ xki->page_list[PA_PTE_0] = __ma(kexec_pte0); ++ xki->page_list[PA_PTE_1] = __ma(kexec_pte1); ++ ++} ++ ++#endif /* CONFIG_XEN */ ++ + /* + * A architecture hook called to validate the + * proposed image and prepare the control pages --- 
0004/patches/linux-2.6.16.29/series +++ work/patches/linux-2.6.16.29/series 2006-10-23 11:36:16.000000000 +0900 @@ -1,6 +1,9 @@ kexec-generic.patch git-2efe55a9cec8418f0e0cde3dc3787a42fddc4411.patch git-2a8a3d5b65e86ec1dfef7d268c64a909eab94af7.patch +git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch +linux-2.6.19-rc1-kexec-move_segment_code-i386.patch +linux-2.6.19-rc1-kexec-xen-i386.patch blktap-aio-16_03_06.patch device_bind.patch fix-hz-suspend.patch --- 0001/xen/arch/x86/x86_32/entry.S +++ work/xen/arch/x86/x86_32/entry.S 2006-10-23 11:36:16.000000000 +0900 @@ -672,6 +672,7 @@ ENTRY(hypercall_table) .long do_hvm_op .long do_sysctl /* 35 */ .long do_domctl + .long do_kexec_op .rept NR_hypercalls-((.-hypercall_table)/4) .long do_ni_hypercall .endr @@ -714,6 +715,7 @@ ENTRY(hypercall_args_table) .byte 2 /* do_hvm_op */ .byte 1 /* do_sysctl */ /* 35 */ .byte 1 /* do_domctl */ + .byte 2 /* do_kexec_op */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr --- 0004/xen/include/asm-x86/x86_32/elf.h +++ work/xen/include/asm-x86/x86_32/elf.h 2006-10-23 11:36:17.000000000 +0900 @@ -1,14 +1,39 @@ +/* + * Based heavily on include/asm-i386/elf.h and + * include/asm-i386/system.h from Linux 2.6.16 + */ + #ifndef __X86_32_ELF_H__ #define __X86_32_ELF_H__ -#include <xen/lib.h> /* for printk() used in stub */ +#define ELF_NGREG 17 -#define ELF_NGREG 1 /* XXX: Define to be at least as large as - however many register slots are needed when - crash notes are written during crash dump */ +/* XXX: Xen doesn't have orig_eax. For kdump, on a dom0 crash, the values + * for the crashing CPU could be passed down from dom0, but is that + * necessary? 
+ * Also, I'm not sure why fs and gs are derived from the CPU + * rather than regs */ -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); +#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ + unsigned i; \ + pr_reg[0] = regs->ebx; \ + pr_reg[1] = regs->ecx; \ + pr_reg[2] = regs->edx; \ + pr_reg[3] = regs->esi; \ + pr_reg[4] = regs->edi; \ + pr_reg[5] = regs->ebp; \ + pr_reg[6] = regs->eax; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9] = i; \ + asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i; \ + pr_reg[11] = 0; /* regs->orig_eax; */ \ + pr_reg[12] = regs->eip; \ + pr_reg[13] = regs->cs; \ + pr_reg[14] = regs->eflags; \ + pr_reg[15] = regs->esp; \ + pr_reg[16] = regs->ss; \ +} while(0); #endif /* __X86_32_ELF_H__ */ --- 0004/xen/include/asm-x86/x86_32/kexec.h +++ work/xen/include/asm-x86/x86_32/kexec.h 2006-10-23 11:36:17.000000000 +0900 @@ -1,36 +1,92 @@ -#ifndef __X86_32_KEXEC_H__ -#define __X86_32_KEXEC_H__ +/****************************************************************************** + * kexec.h + * + * Based heavily on machine_kexec.c and kexec.h from Linux 2.6.19-rc1 + * + */ + +#ifndef __X86_KEXEC_X86_32_H__ +#define __X86_KEXEC_X86_32_H__ -#include <xen/lib.h> /* for printk() used in stub */ #include <xen/types.h> -#include <public/xen.h> #include <xen/kexec.h> +#include <asm/fixmap.h> +#include <asm/processor.h> +/* CPU does not save ss and esp on stack if execution is already + * running in kernel mode at the time of NMI occurrence. This code + * fixes it. 
+ */ static inline void crash_fixup_ss_esp(struct cpu_user_regs *newregs, - struct cpu_user_regs *oldregs) + struct cpu_user_regs *oldregs) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); - return; + memcpy(newregs, oldregs, sizeof(*newregs)); + newregs->esp = (unsigned long)&(oldregs->esp); + __asm__ __volatile__( + "xorl %%eax, %%eax\n\t" + "movw %%ss, %%ax\n\t" + :"=a"(newregs->ss)); } - + +/* + * This function is responsible for capturing register states if coming + * via panic otherwise just fix up the ss and esp if coming via kernel + * mode exception. + */ static inline void crash_setup_regs(struct cpu_user_regs *newregs, - struct cpu_user_regs *oldregs) + struct cpu_user_regs *oldregs) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + if (oldregs) + crash_fixup_ss_esp(newregs, oldregs); + else { + __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx)); + __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx)); + __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx)); + __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi)); + __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi)); + __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp)); + __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax)); + __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp)); + __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss)); + __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs)); + __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds)); + __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es)); + __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags)); + + newregs->eip = (unsigned long)current_text_addr(); + } } +/* + * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h + * + * user_mode_vm(regs) determines whether a register set came from user mode. + * This is true if V8086 mode was enabled OR if the register set was from + * protected mode with RPL-3 CS value. 
This tricky test checks that with + * one comparison. Many places in the kernel can bypass this full check + * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + */ static inline int user_mode(struct cpu_user_regs *regs) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); - return -1; + return (regs->cs & 2) != 0; } +typedef asmlinkage void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long page_list, + unsigned long start_address, + unsigned int has_pae); + static inline void machine_kexec(xen_kexec_image_t *image) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + relocate_new_kernel_t rnk; + + rnk = (relocate_new_kernel_t) image->page_list[1]; + (*rnk)(image->indirection_page, (unsigned long)image->page_list, + image->start_address, (unsigned long)cpu_has_pae); } -#endif /* __X86_32_KEXEC_H__ */ +#endif /* __X86_KEXEC_X86_32_H__ */ /* * Local variables: _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |