[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 04/04] Kexec / Kdump: x86_64 specific code



[PATCH 04/04] Kexec / Kdump: x86_64 specific code

This patch contains the x86_64 implementation of Kexec / Kdump for Xen.

Signed-Off-By: Magnus Damm <magnus@xxxxxxxxxxxxx>
---

 Applies on top of xen-unstable-11856.

 buildconfigs/linux-defconfig_xen_x86_64                           |    1
 linux-2.6-xen-sparse/arch/x86_64/Kconfig                          |    2
 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile                  |    2
 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c               |   27
 linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h               |   64 +
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h  |    7
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h     |    2
 patches/linux-2.6.16.29/git-4b...1f.patch                         |  375 ++++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec..code-x86_64.patch |  161 ++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-x86_64.patch   |  162 ++
 patches/linux-2.6.16.29/series                                    |    3
 xen/arch/x86/x86_64/entry.S                                       |    2
 xen/include/asm-x86/x86_64/elf.h                                  |   49 +
 xen/include/asm-x86/x86_64/kexec.h                                |   60 +
 14 files changed, 903 insertions(+), 14 deletions(-)

--- 0002/buildconfigs/linux-defconfig_xen_x86_64
+++ work/buildconfigs/linux-defconfig_xen_x86_64        2006-10-23 
11:36:17.000000000 +0900
@@ -138,6 +138,7 @@ CONFIG_SWIOTLB=y
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
--- 0001/linux-2.6-xen-sparse/arch/x86_64/Kconfig
+++ work/linux-2.6-xen-sparse/arch/x86_64/Kconfig       2006-10-23 
11:36:17.000000000 +0900
@@ -435,7 +435,7 @@ config X86_MCE_AMD
 
 config KEXEC
        bool "kexec system call (EXPERIMENTAL)"
-       depends on EXPERIMENTAL && !X86_64_XEN
+       depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
--- 0001/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
+++ work/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile       2006-10-23 
11:36:17.000000000 +0900
@@ -59,7 +59,7 @@ pci-dma-y                     += ../../i386/kernel/pci-dma
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := 
../../i386/kernel/microcode-xen.o
 quirks-y                       := ../../i386/kernel/quirks-xen.o
 
-n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o crash.o
 
 include $(srctree)/scripts/Makefile.xen
 
--- 0001/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
+++ work/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c    2006-10-23 
11:36:17.000000000 +0900
@@ -80,6 +80,10 @@
 #include <asm/mach-xen/setup_arch_post.h>
 #include <xen/interface/memory.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 extern unsigned long start_pfn;
 extern struct edid_info edid_info;
 
@@ -450,6 +454,7 @@ static __init void parse_cmdline_early (
                 * after a kernel panic.
                 */
                else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
                        unsigned long size, base;
                        size = memparse(from+12, &from);
                        if (*from == '@') {
@@ -460,6 +465,10 @@ static __init void parse_cmdline_early (
                                crashk_res.start = base;
                                crashk_res.end   = base + size - 1;
                        }
+#else
+                       printk("Ignoring crashkernel command line, "
+                              "parameter will be supplied by xen\n");
+#endif
                }
 #endif
 
@@ -812,10 +821,23 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif /* !CONFIG_XEN */
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
        if (crashk_res.start != crashk_res.end) {
                reserve_bootmem(crashk_res.start,
                        crashk_res.end - crashk_res.start + 1);
        }
+#else
+       {
+               xen_kexec_reserve_t reservation;
+               BUG_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_reserve,
+                                       &reservation));
+               if (reservation.size) {
+                       crashk_res.start = reservation.start;
+                       crashk_res.end = reservation.start + 
+                               reservation.size - 1;
+               }
+       }
+#endif
 #endif
 
        paging_init();
@@ -954,6 +976,11 @@ void __init setup_arch(char **cmdline_p)
        iommu_hole_init();
 #endif
 
+#ifdef CONFIG_KEXEC
+       if (crashk_res.start != crashk_res.end)
+               request_resource(&ioport_resource, &crashk_res);
+#endif
+
 #ifdef CONFIG_XEN
        {
                struct physdev_set_iopl set_iopl;
--- /dev/null
+++ work/linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h    2006-10-23 
11:36:18.000000000 +0900
@@ -0,0 +1,64 @@
+/*
+ * include/asm-x86_64/kexec-xen.h
+ *
+ * Created By: Horms <horms@xxxxxxxxxxxx>
+ */
+
+#ifndef _X86_64_KEXEC_XEN_H
+#define _X86_64_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_64.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+                                       struct cpu_user_regs *xen_regs)
+{
+       xen_regs->r15 = linux_regs->r15;
+       xen_regs->r14 = linux_regs->r14;
+       xen_regs->r13 = linux_regs->r13;
+       xen_regs->r12 = linux_regs->r12;
+       xen_regs->rbp = linux_regs->rbp;
+       xen_regs->rbx = linux_regs->rbx;
+       xen_regs->r11 = linux_regs->r11;
+       xen_regs->r10 = linux_regs->r10;
+       xen_regs->r9 = linux_regs->r9;
+       xen_regs->r8 = linux_regs->r8;
+       xen_regs->rax = linux_regs->rax;
+       xen_regs->rcx = linux_regs->rcx;
+       xen_regs->rdx = linux_regs->rdx;
+       xen_regs->rsi = linux_regs->rsi;
+       xen_regs->rdi = linux_regs->rdi;
+       xen_regs->rip = linux_regs->rip;
+       xen_regs->cs = linux_regs->cs;
+       xen_regs->rflags = linux_regs->eflags;
+       xen_regs->rsp = linux_regs->rsp;
+       xen_regs->ss = linux_regs->ss;
+}
+
+/* Kexec needs to know about the actual physical addresss.
+ * But in xen, on some architectures, a physical address is a
+ * pseudo-physical addresss. */
+#ifdef CONFIG_XEN
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#else
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
+#endif /* _X86_64_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- 0001/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ work/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h       
2006-10-23 11:36:17.000000000 +0900
@@ -386,4 +386,11 @@ HYPERVISOR_xenoprof_op(
        return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec_op(
+       unsigned long op, void *args)
+{
+       return _hypercall2(int, kexec_op, op, args);
+}
+
 #endif /* __HYPERCALL_H__ */
--- 0001/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
+++ work/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h  
2006-10-23 11:36:17.000000000 +0900
@@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p
 #define profile_pc(regs) instruction_pointer(regs)
 #endif
 
+#include <linux/compiler.h>
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
 struct task_struct;
--- /dev/null
+++ 
work/patches/linux-2.6.16.29/git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch 
    2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,375 @@
+From: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200)
+Subject: [PATCH] Avoid overwriting the current pgd (V4, x86_64)
+X-Git-Tag: v2.6.19-rc1
+X-Git-Url: 
http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f
+
+[PATCH] Avoid overwriting the current pgd (V4, x86_64)
+
+kexec: Avoid overwriting the current pgd (V4, x86_64)
+
+This patch upgrades the x86_64-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables. These tables are used to provide
+an executable identity mapping without overwriting the current pgd.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Signed-off-by: Andi Kleen <ak@xxxxxxx>
+---
+
+--- a/arch/x86_64/kernel/machine_kexec.c
++++ b/arch/x86_64/kernel/machine_kexec.c
+@@ -15,6 +15,15 @@
+ #include <asm/mmu_context.h>
+ #include <asm/io.h>
+ 
++#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
++static u64 kexec_pgd[512] PAGE_ALIGNED;
++static u64 kexec_pud0[512] PAGE_ALIGNED;
++static u64 kexec_pmd0[512] PAGE_ALIGNED;
++static u64 kexec_pte0[512] PAGE_ALIGNED;
++static u64 kexec_pud1[512] PAGE_ALIGNED;
++static u64 kexec_pmd1[512] PAGE_ALIGNED;
++static u64 kexec_pte1[512] PAGE_ALIGNED;
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+       unsigned long end_addr;
+@@ -144,32 +153,19 @@ static void load_segments(void)
+               );
+ }
+ 
+-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long 
indirection_page,
+-                                      unsigned long control_code_buffer,
+-                                      unsigned long start_address,
+-                                      unsigned long pgtable) ATTRIB_NORET;
+-
+-extern const unsigned char relocate_new_kernel[];
+-extern const unsigned long relocate_new_kernel_size;
+-
+ int machine_kexec_prepare(struct kimage *image)
+ {
+-      unsigned long start_pgtable, control_code_buffer;
++      unsigned long start_pgtable;
+       int result;
+ 
+       /* Calculate the offsets */
+       start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-      control_code_buffer = start_pgtable + PAGE_SIZE;
+ 
+       /* Setup the identity mapped 64bit page table */
+       result = init_pgtable(image, start_pgtable);
+       if (result)
+               return result;
+ 
+-      /* Place the code in the reboot code buffer */
+-      memcpy(__va(control_code_buffer), relocate_new_kernel,
+-                                              relocate_new_kernel_size);
+-
+       return 0;
+ }
+ 
+@@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage
+  */
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+-      unsigned long page_list;
+-      unsigned long control_code_buffer;
+-      unsigned long start_pgtable;
+-      relocate_new_kernel_t rnk;
++      unsigned long page_list[PAGES_NR];
++      void *control_page;
+ 
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+ 
+-      /* Calculate the offsets */
+-      page_list = image->head;
+-      start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-      control_code_buffer = start_pgtable + PAGE_SIZE;
++      control_page = page_address(image->control_code_page) + PAGE_SIZE;
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
+ 
+-      /* Set the low half of the page table to my identity mapped
+-       * page table for kexec.  Leave the high half pointing at the
+-       * kernel pages.   Don't bother to flush the global pages
+-       * as that will happen when I fully switch to my identity mapped
+-       * page table anyway.
+-       */
+-      memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+-      __flush_tlb();
++      page_list[PA_CONTROL_PAGE] = __pa(control_page);
++      page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
++      page_list[PA_PGD] = __pa(kexec_pgd);
++      page_list[VA_PGD] = (unsigned long)kexec_pgd;
++      page_list[PA_PUD_0] = __pa(kexec_pud0);
++      page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
++      page_list[PA_PMD_0] = __pa(kexec_pmd0);
++      page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
++      page_list[PA_PTE_0] = __pa(kexec_pte0);
++      page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
++      page_list[PA_PUD_1] = __pa(kexec_pud1);
++      page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
++      page_list[PA_PMD_1] = __pa(kexec_pmd1);
++      page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
++      page_list[PA_PTE_1] = __pa(kexec_pte1);
++      page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ 
++      page_list[PA_TABLE_PAGE] =
++        (unsigned long)__pa(page_address(image->control_code_page));
+ 
+       /* The segment registers are funny things, they have both a
+        * visible and an invisible part.  Whenever the visible part is
+@@ -222,9 +224,10 @@ NORET_TYPE void machine_kexec(struct kim
+        */
+       set_gdt(phys_to_virt(0),0);
+       set_idt(phys_to_virt(0),0);
++
+       /* now call it */
+-      rnk = (relocate_new_kernel_t) control_code_buffer;
+-      (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
++      relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
++                      image->start);
+ }
+ 
+ /* crashkernel=size@addr specifies the location to reserve for
+--- a/arch/x86_64/kernel/relocate_kernel.S
++++ b/arch/x86_64/kernel/relocate_kernel.S
+@@ -7,31 +7,169 @@
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++#include <asm/kexec.h>
+ 
+-      /*
+-       * Must be relocatable PIC code callable as a C function, that once
+-       * it starts can not use the previous processes stack.
+-       */
+-      .globl relocate_new_kernel
++/*
++ * Must be relocatable PIC code callable as a C function
++ */
++
++#define PTR(x) (x << 3)
++#define PAGE_ALIGNED (1 << PAGE_SHIFT)
++#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
++
++      .text
++      .align PAGE_ALIGNED
+       .code64
++      .globl relocate_kernel
++relocate_kernel:
++      /* %rdi indirection_page
++       * %rsi page_list
++       * %rdx start address
++       */
++
++      /* map the control page at its virtual address */
++
++      movq    $0x0000ff8000000000, %r10        /* mask */
++      mov     $(39 - 3), %cl                   /* bits to shift */
++      movq    PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PGD)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PUD_0)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PUD_0)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PMD_0)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PMD_0)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PTE_0)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PTE_0)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      /* identity map the control page at its physical address */
++
++      movq    $0x0000ff8000000000, %r10        /* mask */
++      mov     $(39 - 3), %cl                   /* bits to shift */
++      movq    PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PGD)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PUD_1)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PUD_1)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PMD_1)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PMD_1)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PTE_1)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PTE_1)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
+ relocate_new_kernel:
+-      /* %rdi page_list
+-       * %rsi reboot_code_buffer
++      /* %rdi indirection_page
++       * %rsi page_list
+        * %rdx start address
+-       * %rcx page_table
+-       * %r8  arg5
+-       * %r9  arg6
+        */
+ 
+       /* zero out flags, and disable interrupts */
+       pushq $0
+       popfq
+ 
+-      /* set a new stack at the bottom of our page... */
+-      lea   4096(%rsi), %rsp
++      /* get physical address of control page now */
++      /* this is impossible after page table switch */
++      movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
++
++      /* get physical address of page table now too */
++      movq    PTR(PA_TABLE_PAGE)(%rsi), %rcx
++
++      /* switch to new set of page tables */
++      movq    PTR(PA_PGD)(%rsi), %r9
++      movq    %r9, %cr3
++
++      /* setup a new stack at the end of the physical control page */
++      lea     4096(%r8), %rsp
++
++      /* jump to identity mapped page */
++      addq    $(identity_mapped - relocate_kernel), %r8
++      pushq   %r8
++      ret
+ 
+-      /* store the parameters back on the stack */
+-      pushq   %rdx /* store the start address */
++identity_mapped:
++      /* store the start address on the stack */
++      pushq   %rdx
+ 
+       /* Set cr0 to a known state:
+        * 31 1 == Paging enabled
+@@ -136,8 +274,3 @@ relocate_new_kernel:
+       xorq    %r15, %r15
+ 
+       ret
+-relocate_new_kernel_end:
+-
+-      .globl relocate_new_kernel_size
+-relocate_new_kernel_size:
+-      .quad relocate_new_kernel_end - relocate_new_kernel
+--- a/include/asm-x86_64/kexec.h
++++ b/include/asm-x86_64/kexec.h
+@@ -1,6 +1,27 @@
+ #ifndef _X86_64_KEXEC_H
+ #define _X86_64_KEXEC_H
+ 
++#define PA_CONTROL_PAGE  0
++#define VA_CONTROL_PAGE  1
++#define PA_PGD           2
++#define VA_PGD           3
++#define PA_PUD_0         4
++#define VA_PUD_0         5
++#define PA_PMD_0         6
++#define VA_PMD_0         7
++#define PA_PTE_0         8
++#define VA_PTE_0         9
++#define PA_PUD_1         10
++#define VA_PUD_1         11
++#define PA_PMD_1         12
++#define VA_PMD_1         13
++#define PA_PTE_1         14
++#define VA_PTE_1         15
++#define PA_TABLE_PAGE    16
++#define PAGES_NR         17
++
++#ifndef __ASSEMBLY__
++
+ #include <linux/string.h>
+ 
+ #include <asm/page.h>
+@@ -64,4 +85,12 @@ static inline void crash_setup_regs(stru
+               newregs->rip = (unsigned long)current_text_addr();
+       }
+ }
++
++NORET_TYPE void
++relocate_kernel(unsigned long indirection_page,
++              unsigned long page_list,
++              unsigned long start_address) ATTRIB_NORET;
++
++#endif /* __ASSEMBLY__ */
++
+ #endif /* _X86_64_KEXEC_H */
--- /dev/null
+++ 
work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch
  2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,161 @@
+kexec: Move asm segment handling code to the assembly file (x86_64)
+
+This patch moves the idt, gdt, and segment handling code from machine_kexec.c
+to relocate_kernel.S.  The main reason behind this move is to avoid code 
+duplication in the Xen hypervisor. With this patch all code required to kexec
+is put on the control page.
+
+On top of that this patch also counts as a cleanup - I think it is much
+nicer to write assembly directly in assembly files than wrap inline assembly
+in C functions for no apparent reason.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+---
+
+ Applies to 2.6.19-rc1.
+
+ machine_kexec.c   |   58 -----------------------------------------------------
+ relocate_kernel.S |   50 +++++++++++++++++++++++++++++++++++++++++----
+ 2 files changed, 45 insertions(+), 63 deletions(-)
+
+--- 0002/arch/x86_64/kernel/machine_kexec.c
++++ work/arch/x86_64/kernel/machine_kexec.c    2006-10-05 16:15:49.000000000 
+0900
+@@ -112,47 +112,6 @@ static int init_pgtable(struct kimage *i
+       return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+ }
+ 
+-static void set_idt(void *newidt, u16 limit)
+-{
+-      struct desc_ptr curidt;
+-
+-      /* x86-64 supports unaliged loads & stores */
+-      curidt.size    = limit;
+-      curidt.address = (unsigned long)newidt;
+-
+-      __asm__ __volatile__ (
+-              "lidtq %0\n"
+-              : : "m" (curidt)
+-              );
+-};
+-
+-
+-static void set_gdt(void *newgdt, u16 limit)
+-{
+-      struct desc_ptr curgdt;
+-
+-      /* x86-64 supports unaligned loads & stores */
+-      curgdt.size    = limit;
+-      curgdt.address = (unsigned long)newgdt;
+-
+-      __asm__ __volatile__ (
+-              "lgdtq %0\n"
+-              : : "m" (curgdt)
+-              );
+-};
+-
+-static void load_segments(void)
+-{
+-      __asm__ __volatile__ (
+-              "\tmovl %0,%%ds\n"
+-              "\tmovl %0,%%es\n"
+-              "\tmovl %0,%%ss\n"
+-              "\tmovl %0,%%fs\n"
+-              "\tmovl %0,%%gs\n"
+-              : : "a" (__KERNEL_DS) : "memory"
+-              );
+-}
+-
+ int machine_kexec_prepare(struct kimage *image)
+ {
+       unsigned long start_pgtable;
+@@ -209,23 +168,6 @@ NORET_TYPE void machine_kexec(struct kim
+       page_list[PA_TABLE_PAGE] =
+         (unsigned long)__pa(page_address(image->control_code_page));
+ 
+-      /* The segment registers are funny things, they have both a
+-       * visible and an invisible part.  Whenever the visible part is
+-       * set to a specific selector, the invisible part is loaded
+-       * with from a table in memory.  At no other time is the
+-       * descriptor table in memory accessed.
+-       *
+-       * I take advantage of this here by force loading the
+-       * segments, before I zap the gdt with an invalid value.
+-       */
+-      load_segments();
+-      /* The gdt & idt are now invalid.
+-       * If you want to load them you must set up your own idt & gdt.
+-       */
+-      set_gdt(phys_to_virt(0),0);
+-      set_idt(phys_to_virt(0),0);
+-
+-      /* now call it */
+       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+                       image->start);
+ }
+--- 0002/arch/x86_64/kernel/relocate_kernel.S
++++ work/arch/x86_64/kernel/relocate_kernel.S  2006-10-05 16:18:07.000000000 
+0900
+@@ -159,13 +159,39 @@ relocate_new_kernel:
+       movq    PTR(PA_PGD)(%rsi), %r9
+       movq    %r9, %cr3
+ 
++      /* setup idt */
++      movq    %r8, %rax
++      addq    $(idt_80 - relocate_kernel), %rax
++      lidtq   (%rax)
++
++      /* setup gdt */
++      movq    %r8, %rax
++      addq    $(gdt - relocate_kernel), %rax
++      movq    %r8, %r9
++      addq    $((gdt_80 - relocate_kernel) + 2), %r9
++      movq    %rax, (%r9)
++
++      movq    %r8, %rax
++      addq    $(gdt_80 - relocate_kernel), %rax
++      lgdtq   (%rax)
++
++      /* setup data segment registers */
++      xorl    %eax, %eax
++      movl    %eax, %ds
++      movl    %eax, %es
++      movl    %eax, %fs
++      movl    %eax, %gs
++      movl    %eax, %ss
++      
+       /* setup a new stack at the end of the physical control page */
+       lea     4096(%r8), %rsp
+ 
+-      /* jump to identity mapped page */
+-      addq    $(identity_mapped - relocate_kernel), %r8
+-      pushq   %r8
+-      ret
++      /* load new code segment and jump to identity mapped page */
++      movq    %r8, %rax
++      addq    $(identity_mapped - relocate_kernel), %rax
++      pushq   $(gdt_cs - gdt)
++      pushq   %rax
++      lretq
+ 
+ identity_mapped:
+       /* store the start address on the stack */
+@@ -272,5 +298,19 @@ identity_mapped:
+       xorq    %r13, %r13
+       xorq    %r14, %r14
+       xorq    %r15, %r15
+-
+       ret
++
++      .align  16
++gdt:
++      .quad   0x0000000000000000      /* NULL descriptor */
++gdt_cs:
++      .quad   0x00af9a000000ffff
++gdt_end:
++
++gdt_80:
++      .word   gdt_end - gdt - 1       /* limit */
++      .quad   0                       /* base - filled in by code above */
++
++idt_80:
++      .word   0                       /* limit */
++      .quad   0                       /* base */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-x86_64.patch        
2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,162 @@
+--- 0006/arch/x86_64/kernel/machine_kexec.c
++++ work/arch/x86_64/kernel/machine_kexec.c    2006-10-06 15:36:16.000000000 
+0900
+@@ -24,6 +24,104 @@ static u64 kexec_pud1[512] PAGE_ALIGNED;
+ static u64 kexec_pmd1[512] PAGE_ALIGNED;
+ static u64 kexec_pte1[512] PAGE_ALIGNED;
+ 
++#ifdef CONFIG_XEN
++
++/* In the case of Xen, override hypervisor functions to be able to create
++ * a regular identity mapping page table...
++ */
++
++#include <xen/interface/kexec.h>
++#include <xen/interface/memory.h>
++
++#define x__pmd(x) ((pmd_t) { (x) } )
++#define x__pud(x) ((pud_t) { (x) } )
++#define x__pgd(x) ((pgd_t) { (x) } )
++
++#define x_pmd_val(x)   ((x).pmd)
++#define x_pud_val(x)   ((x).pud)
++#define x_pgd_val(x)   ((x).pgd)
++
++static inline void x_set_pmd(pmd_t *dst, pmd_t val)
++{
++      x_pmd_val(*dst) = x_pmd_val(val);
++}
++
++static inline void x_set_pud(pud_t *dst, pud_t val)
++{
++      x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
++}
++
++static inline void x_pud_clear (pud_t *pud)
++{
++      x_pud_val(*pud) = 0;
++}
++
++static inline void x_set_pgd(pgd_t *dst, pgd_t val)
++{
++      x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
++}
++
++static inline void x_pgd_clear (pgd_t * pgd)
++{
++      x_pgd_val(*pgd) = 0;
++}
++
++#define X__PAGE_KERNEL_LARGE_EXEC \
++         _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
++#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
++
++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
++
++#if PAGES_NR > KEXEC_XEN_NO_PAGES
++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
++#endif
++
++#if PA_CONTROL_PAGE != 0
++#error PA_CONTROL_PAGE is non zero - Xen support will break
++#endif
++
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage 
*image)
++{
++      void *control_page;
++      void *table_page;
++
++      memset(xki->page_list, 0, sizeof(xki->page_list));
++
++      control_page = page_address(image->control_code_page) + PAGE_SIZE;
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++      table_page = page_address(image->control_code_page);
++
++      xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
++      xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
++
++      xki->page_list[PA_PGD] = __ma(kexec_pgd);
++      xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
++      xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
++      xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
++      xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
++      xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
++      xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
++}
++
++#else /* CONFIG_XEN */
++
++#define x__pmd(x) __pmd(x)
++#define x__pud(x) __pud(x)
++#define x__pgd(x) __pgd(x)
++
++#define x_set_pmd(x, y) set_pmd(x, y)
++#define x_set_pud(x, y) set_pud(x, y)
++#define x_set_pgd(x, y) set_pgd(x, y)
++
++#define x_pud_clear(x) pud_clear(x)
++#define x_pgd_clear(x) pgd_clear(x)
++
++#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
++#define X_KERNPG_TABLE _KERNPG_TABLE
++
++#endif /* CONFIG_XEN */
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+       unsigned long end_addr;
+@@ -31,7 +129,7 @@ static void init_level2_page(pmd_t *leve
+       addr &= PAGE_MASK;
+       end_addr = addr + PUD_SIZE;
+       while (addr < end_addr) {
+-              set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++              x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
+               addr += PMD_SIZE;
+       }
+ }
+@@ -56,12 +154,12 @@ static int init_level3_page(struct kimag
+               }
+               level2p = (pmd_t *)page_address(page);
+               init_level2_page(level2p, addr);
+-              set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
++              x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
+               addr += PUD_SIZE;
+       }
+       /* clear the unused entries */
+       while (addr < end_addr) {
+-              pud_clear(level3p++);
++              x_pud_clear(level3p++);
+               addr += PUD_SIZE;
+       }
+ out:
+@@ -92,12 +190,12 @@ static int init_level4_page(struct kimag
+               if (result) {
+                       goto out;
+               }
+-              set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
++              x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
+               addr += PGDIR_SIZE;
+       }
+       /* clear the unused entries */
+       while (addr < end_addr) {
+-              pgd_clear(level4p++);
++              x_pgd_clear(level4p++);
+               addr += PGDIR_SIZE;
+       }
+ out:
+@@ -108,8 +206,14 @@ out:
+ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+ {
+       pgd_t *level4p;
++      unsigned long x_end_pfn = end_pfn;
++
++#ifdef CONFIG_XEN
++      x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++#endif
++
+       level4p = (pgd_t *)__va(start_pgtable);
+-      return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
++      return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT);
+ }
+ 
+ int machine_kexec_prepare(struct kimage *image)
--- 0005/patches/linux-2.6.16.29/series
+++ work/patches/linux-2.6.16.29/series 2006-10-23 11:36:17.000000000 +0900
@@ -4,6 +4,9 @@ git-2a8a3d5b65e86ec1dfef7d268c64a909eab9
 git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch
 linux-2.6.19-rc1-kexec-move_segment_code-i386.patch
 linux-2.6.19-rc1-kexec-xen-i386.patch
+git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch
+linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch
+linux-2.6.19-rc1-kexec-xen-x86_64.patch
 blktap-aio-16_03_06.patch
 device_bind.patch
 fix-hz-suspend.patch
--- 0001/xen/arch/x86/x86_64/entry.S
+++ work/xen/arch/x86/x86_64/entry.S    2006-10-23 11:36:17.000000000 +0900
@@ -573,6 +573,7 @@ ENTRY(hypercall_table)
         .quad do_hvm_op
         .quad do_sysctl             /* 35 */
         .quad do_domctl
+        .quad do_kexec_op
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -615,6 +616,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_hvm_op            */
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
+        .byte 2 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- 0004/xen/include/asm-x86/x86_64/elf.h
+++ work/xen/include/asm-x86/x86_64/elf.h       2006-10-23 11:36:17.000000000 
+0900
@@ -1,14 +1,51 @@
+/*
+ * Based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS from Linux 2.6.16
+ */
+
 #ifndef __X86_64_ELF_H__
 #define __X86_64_ELF_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#define ELF_NGREG 27
 
-#define ELF_NGREG 1       /* XXX: Define to be at least as large as
-                             however many register slots are needed when
-                             crash notes are written during crash dump */
+/* XXX: Xen doesn't have orig_rax, so it is omitted.
+ *      Xen dosn't have threads, so fs and gs are read from the CPU and
+ *      thus values 21 and 22 are just duplicates of 25 and 26
+ *      respectively.  All these values could be passed from dom0 in the
+ *      case of it crashing, but does that help?
+ *
+ *      Lastly, I'm not sure why ds, es, fs and gs are read from
+ *      the CPU rather than regs, but linux does this
+ */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs)  do { \
+       unsigned v;                                             \
+       (pr_reg)[0] = (regs)->r15;                              \
+       (pr_reg)[1] = (regs)->r14;                              \
+       (pr_reg)[2] = (regs)->r13;                              \
+       (pr_reg)[3] = (regs)->r12;                              \
+       (pr_reg)[4] = (regs)->rbp;                              \
+       (pr_reg)[5] = (regs)->rbx;                              \
+       (pr_reg)[6] = (regs)->r11;                              \
+       (pr_reg)[7] = (regs)->r10;                              \
+       (pr_reg)[8] = (regs)->r9;                               \
+       (pr_reg)[9] = (regs)->r8;                               \
+       (pr_reg)[10] = (regs)->rax;                             \
+       (pr_reg)[11] = (regs)->rcx;                             \
+       (pr_reg)[12] = (regs)->rdx;                             \
+       (pr_reg)[13] = (regs)->rsi;                             \
+       (pr_reg)[14] = (regs)->rdi;                             \
+       (pr_reg)[16] = (regs)->rip;                     \
+       (pr_reg)[17] = (regs)->cs;                      \
+       (pr_reg)[18] = (regs)->eflags;                  \
+       (pr_reg)[19] = (regs)->rsp;                     \
+       (pr_reg)[20] = (regs)->ss;                      \
+       asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[21] = v;       \
+       asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[22] = v;       \
+       asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;       \
+       asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;       \
+       asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;       \
+       asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v;       \
+} while(0);
 
 #endif /* __X86_64_ELF_H__ */
 
--- 0004/xen/include/asm-x86/x86_64/kexec.h
+++ work/xen/include/asm-x86/x86_64/kexec.h     2006-10-23 11:36:17.000000000 
+0900
@@ -1,20 +1,68 @@
+/******************************************************************************
+ * kexec.h
+ * 
+ * Based heavily on machine_kexec.c and kexec.h from Linux 2.6.19-rc1
+ *
+ */
+
 #ifndef __X86_64_KEXEC_H__
 #define __X86_64_KEXEC_H__
-
-#include <xen/lib.h>       /* for printk() used in stub */
+  
+#include <xen/lib.h>
 #include <xen/types.h>
 #include <public/xen.h>
 #include <xen/kexec.h>
-
+#include <asm/processor.h>
+#include <xen/string.h>
+#include <asm/fixmap.h>
+  
+/*
+ * Saving the registers of the cpu on which panic occured in
+ * crash_kexec to save a valid sp. The registers of other cpus
+ * will be saved in machine_crash_shutdown while shooting down them.
+ */
 static inline void crash_setup_regs(struct cpu_user_regs *newregs,
-                                    struct cpu_user_regs *oldregs)
+                            struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+       if (oldregs)
+               memcpy(newregs, oldregs, sizeof(*newregs));
+       else {
+               __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
+               __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
+               __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
+               __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
+               __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
+               __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
+               __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
+               __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
+               __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
+               __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
+               __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
+               __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
+               __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
+               __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
+               __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
+               __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
+               __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+               __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+               __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
+
+               newregs->rip = (unsigned long)current_text_addr();
+       }
 }
 
+typedef void (*relocate_new_kernel_t)(
+                unsigned long indirection_page,
+                unsigned long page_list,
+                unsigned long start_address);
+
 static inline void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    relocate_new_kernel_t rnk;
+
+    rnk = (relocate_new_kernel_t) image->page_list[1];
+    (*rnk)(image->indirection_page, (unsigned long)image->page_list, 
+           image->start_address);
 }
 
 #endif /* __X86_64_KEXEC_H__ */

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.