[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC][PATCH] xen: Kexec patch for pvops kernel



Hi,

I am posting first kexec patch for pvops kernel. It applies to
git://oss.oracle.com/git/kwilk/xen.git tree, stable/2.6.39.x branch.
Tested on x86_64. Compiles for x86_32. It should be used with
latest kexec-tools development version, which can be found at
git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git.

TODO:
  - it should work on bare metal and on the Xen hypervisor
    (currently this feature is broken; kexec/kdump works
    only on the Xen hypervisor),
  - move Xen code from generic and arch source files
    to Xen specific files,
  - reuse available generic Linux Kernel code
    as much as possible.

It is a work in progress and I am looking for comments only.
It is not the final version.

Daniel

 arch/x86/include/asm/kexec.h         |   16 ++
 arch/x86/include/asm/xen/hypercall.h |    6 +
 arch/x86/kernel/machine_kexec_32.c   |  118 ++++++++--------
 arch/x86/kernel/machine_kexec_64.c   |  192 +++++++++++++++++---------
 arch/x86/kernel/relocate_kernel_32.S |   39 +++++-
 arch/x86/kernel/relocate_kernel_64.S |   36 +++++-
 arch/x86/kernel/setup.c              |    5 +-
 arch/x86/xen/enlighten.c             |   11 ++-
 drivers/base/cpu.c                   |    4 +-
 drivers/xen/Makefile                 |    1 +
 drivers/xen/machine_kexec.c          |  256 ++++++++++++++++++++++++++++++++++
 drivers/xen/sys-hypervisor.c         |   40 ++++++
 drivers/xen/xenbus/xenbus_probe.c    |   98 +++++++++++++
 include/linux/kexec.h                |   13 ++
 include/xen/interface/kexec.h        |  158 +++++++++++++++++++++
 include/xen/interface/xen.h          |    1 +
 kernel/kexec.c                       |   93 ++++++++++--
 17 files changed, 939 insertions(+), 148 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff17..578697e 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -5,14 +5,30 @@
 # define PA_CONTROL_PAGE       0
 # define VA_CONTROL_PAGE       1
 # define PA_PGD                        2
+# ifndef CONFIG_XEN
 # define PA_SWAP_PAGE          3
 # define PAGES_NR              4
+# else /* CONFIG_XEN */
+/*
+ * The hypervisor interface implicitly requires that all entries (except
+ * for possibly the final one) are arranged in matching PA_/VA_ pairs.
+#  define VA_PGD               3
+ */
+#  define PA_SWAP_PAGE         4
+#  define PAGES_NR             5
+# endif /* CONFIG_XEN */
 #else
 # define PA_CONTROL_PAGE       0
 # define VA_CONTROL_PAGE       1
 # define PA_TABLE_PAGE         2
+# ifndef CONFIG_XEN
 # define PA_SWAP_PAGE          3
 # define PAGES_NR              4
+# else /* CONFIG_XEN, see comment above
+#  define VA_TABLE_PAGE                3 */
+#  define PA_SWAP_PAGE         4
+#  define PAGES_NR             5
+# endif /* CONFIG_XEN */
 #endif
 
 # define KEXEC_CONTROL_CODE_MAX_SIZE   2048
diff --git a/arch/x86/include/asm/xen/hypercall.h 
b/arch/x86/include/asm/xen/hypercall.h
index 18882f7..2db0222 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -468,6 +468,12 @@ HYPERVISOR_xenoprof_op(unsigned int op, void *arg)
        return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int __must_check
+HYPERVISOR_kexec_op(unsigned long op, void *args)
+{
+       return _hypercall2(int, kexec_op, op, args);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
diff --git a/arch/x86/kernel/machine_kexec_32.c 
b/arch/x86/kernel/machine_kexec_32.c
index a3fa43b..14b7fa8 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -27,47 +27,13 @@
 #include <asm/cacheflush.h>
 #include <asm/debugreg.h>
 
-static void set_idt(void *newidt, __u16 limit)
-{
-       struct desc_ptr curidt;
-
-       /* ia32 supports unaliged loads & stores */
-       curidt.size    = limit;
-       curidt.address = (unsigned long)newidt;
-
-       load_idt(&curidt);
-}
-
+#ifdef CONFIG_XEN
+#include <xen/xen-ops.h>
 
-static void set_gdt(void *newgdt, __u16 limit)
-{
-       struct desc_ptr curgdt;
-
-       /* ia32 supports unaligned loads & stores */
-       curgdt.size    = limit;
-       curgdt.address = (unsigned long)newgdt;
+#include <xen/interface/kexec.h>
 
-       load_gdt(&curgdt);
-}
-
-static void load_segments(void)
-{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
-       __asm__ __volatile__ (
-               "\tljmp $"STR(__KERNEL_CS)",$1f\n"
-               "\t1:\n"
-               "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
-               "\tmovl %%eax,%%ds\n"
-               "\tmovl %%eax,%%es\n"
-               "\tmovl %%eax,%%fs\n"
-               "\tmovl %%eax,%%gs\n"
-               "\tmovl %%eax,%%ss\n"
-               : : : "eax", "memory");
-#undef STR
-#undef __STR
-}
+#include <asm/xen/page.h>
+#endif
 
 static void machine_kexec_free_page_tables(struct kimage *image)
 {
@@ -84,6 +50,15 @@ static int machine_kexec_alloc_page_tables(struct kimage 
*image)
 {
        image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
 #ifdef CONFIG_X86_PAE
+#ifdef CONFIG_XEN /* machine address must fit into xki->page_list[PA_PGD] */
+       if (image->arch.pgd) {
+               if 
(xen_create_contiguous_region(native_pgd_val(*image->arch.pgd), 0, 
BITS_PER_LONG) < 0) {
+                       __free_page(virt_to_page(image->arch.pgd));
+                       image->arch.pgd = NULL;
+                       return -ENOMEM;
+               }
+       }
+#endif
        image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
        image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
 #endif
@@ -139,6 +114,51 @@ static void machine_kexec_prepare_page_tables(struct 
kimage *image)
                __pa(control_page), __pa(control_page));
 }
 
+#ifdef CONFIG_XEN
+
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
+
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
+
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+       void *control_page;
+
+       memset(xki->page_list, 0, sizeof(xki->page_list));
+
+       control_page = page_address(image->control_code_page);
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+       xki->page_list[PA_PGD] = __ma(image->arch.pgd);
+
+       if (image->type == KEXEC_TYPE_DEFAULT)
+               xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
+}
+
+int __init machine_kexec_setup_resources(struct resource *hypervisor,
+                                        struct resource *phys_cpus,
+                                        int nr_phys_cpus)
+{
+       int k;
+
+       /* The per-cpu crash note resources belong to the hypervisor resource */
+       for (k = 0; k < nr_phys_cpus; k++)
+               request_resource(hypervisor, phys_cpus + k);
+
+       return 0;
+}
+
+void machine_kexec_register_resources(struct resource *res) { ; }
+
+#endif /* CONFIG_XEN */
+
 /*
  * A architecture hook called to validate the
  * proposed image and prepare the control pages
@@ -176,6 +196,7 @@ void machine_kexec_cleanup(struct kimage *image)
        machine_kexec_free_page_tables(image);
 }
 
+#ifndef CONFIG_XEN
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
@@ -228,24 +249,6 @@ void machine_kexec(struct kimage *image)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);
 
-       /*
-        * The segment registers are funny things, they have both a
-        * visible and an invisible part.  Whenever the visible part is
-        * set to a specific selector, the invisible part is loaded
-        * with from a table in memory.  At no other time is the
-        * descriptor table in memory accessed.
-        *
-        * I take advantage of this here by force loading the
-        * segments, before I zap the gdt with an invalid value.
-        */
-       load_segments();
-       /*
-        * The gdt & idt are now invalid.
-        * If you want to load them you must set up your own idt & gdt.
-        */
-       set_gdt(phys_to_virt(0), 0);
-       set_idt(phys_to_virt(0), 0);
-
        /* now call it */
        image->start = relocate_kernel_ptr((unsigned long)image->head,
                                           (unsigned long)page_list,
@@ -259,6 +262,7 @@ void machine_kexec(struct kimage *image)
 
        __ftrace_enabled_restore(save_ftrace_enabled);
 }
+#endif
 
 void arch_crash_save_vmcoreinfo(void)
 {
diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db..c7623a4 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -21,6 +21,115 @@
 #include <asm/mmu_context.h>
 #include <asm/debugreg.h>
 
+#ifdef CONFIG_XEN
+
+/* In the case of Xen, override hypervisor functions to be able to create
+ * a regular identity mapping page table...
+ */
+
+#include <xen/interface/kexec.h>
+#include <xen/interface/memory.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+
+#define x__pmd(x) ((pmd_t) { (x) } )
+#define x__pud(x) ((pud_t) { (x) } )
+#define x__pgd(x) ((pgd_t) { (x) } )
+
+#define x_pmd_val(x)   ((x).pmd)
+#define x_pud_val(x)   ((x).pud)
+#define x_pgd_val(x)   ((x).pgd)
+
+static inline void x_set_pmd(pmd_t *dst, pmd_t val)
+{
+       x_pmd_val(*dst) = x_pmd_val(val);
+}
+
+static inline void x_set_pud(pud_t *dst, pud_t val)
+{
+       x_pud_val(*dst) = phys_to_machine(XPADDR(x_pud_val(val))).maddr;
+}
+
+static inline void x_pud_clear (pud_t *pud)
+{
+       x_pud_val(*pud) = 0;
+}
+
+static inline void x_set_pgd(pgd_t *dst, pgd_t val)
+{
+       x_pgd_val(*dst) = phys_to_machine(XPADDR(x_pgd_val(val))).maddr;
+}
+
+static inline void x_pgd_clear (pgd_t * pgd)
+{
+       x_pgd_val(*pgd) = 0;
+}
+
+#define X__PAGE_KERNEL_LARGE_EXEC \
+         _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
+#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
+
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
+
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
+
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+       void *control_page;
+       void *table_page;
+
+       memset(xki->page_list, 0, sizeof(xki->page_list));
+
+       control_page = page_address(image->control_code_page) + PAGE_SIZE;
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       table_page = page_address(image->control_code_page);
+
+       xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+       xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
+
+       if (image->type == KEXEC_TYPE_DEFAULT)
+               xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
+}
+
+int __init machine_kexec_setup_resources(struct resource *hypervisor,
+                                        struct resource *phys_cpus,
+                                        int nr_phys_cpus)
+{
+       int k;
+
+       /* The per-cpu crash note resources belong to the hypervisor resource */
+       for (k = 0; k < nr_phys_cpus; k++)
+               request_resource(hypervisor, phys_cpus + k);
+
+       return 0;
+}
+
+#else /* CONFIG_XEN */
+
+#define x__pmd(x) __pmd(x)
+#define x__pud(x) __pud(x)
+#define x__pgd(x) __pgd(x)
+
+#define x_set_pmd(x, y) set_pmd(x, y)
+#define x_set_pud(x, y) set_pud(x, y)
+#define x_set_pgd(x, y) set_pgd(x, y)
+
+#define x_pud_clear(x) pud_clear(x)
+#define x_pgd_clear(x) pgd_clear(x)
+
+#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#define X_KERNPG_TABLE _KERNPG_TABLE
+
+#endif /* CONFIG_XEN */
+
 static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
                                unsigned long addr)
 {
@@ -50,7 +159,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t 
*pgd,
        }
        pmd = pmd_offset(pud, addr);
        if (!pmd_present(*pmd))
-               set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+               x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
        result = 0;
 out:
        return result;
@@ -63,7 +172,7 @@ static void init_level2_page(pmd_t *level2p, unsigned long 
addr)
        addr &= PAGE_MASK;
        end_addr = addr + PUD_SIZE;
        while (addr < end_addr) {
-               set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+               x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
                addr += PMD_SIZE;
        }
 }
@@ -88,12 +197,12 @@ static int init_level3_page(struct kimage *image, pud_t 
*level3p,
                }
                level2p = (pmd_t *)page_address(page);
                init_level2_page(level2p, addr);
-               set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+               x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
                addr += PUD_SIZE;
        }
        /* clear the unused entries */
        while (addr < end_addr) {
-               pud_clear(level3p++);
+               x_pud_clear(level3p++);
                addr += PUD_SIZE;
        }
 out:
@@ -123,12 +232,12 @@ static int init_level4_page(struct kimage *image, pgd_t 
*level4p,
                result = init_level3_page(image, level3p, addr, last_addr);
                if (result)
                        goto out;
-               set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+               x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
                addr += PGDIR_SIZE;
        }
        /* clear the unused entries */
        while (addr < end_addr) {
-               pgd_clear(level4p++);
+               x_pgd_clear(level4p++);
                addr += PGDIR_SIZE;
        }
 out:
@@ -189,8 +298,14 @@ static int init_pgtable(struct kimage *image, unsigned 
long start_pgtable)
 {
        pgd_t *level4p;
        int result;
+       unsigned long x_max_pfn = max_pfn;
+
+#ifdef CONFIG_XEN
+       x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+#endif
+
        level4p = (pgd_t *)__va(start_pgtable);
-       result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
+       result = init_level4_page(image, level4p, 0, x_max_pfn << PAGE_SHIFT);
        if (result)
                return result;
        /*
@@ -203,47 +318,6 @@ static int init_pgtable(struct kimage *image, unsigned 
long start_pgtable)
        return init_transition_pgtable(image, level4p);
 }
 
-static void set_idt(void *newidt, u16 limit)
-{
-       struct desc_ptr curidt;
-
-       /* x86-64 supports unaliged loads & stores */
-       curidt.size    = limit;
-       curidt.address = (unsigned long)newidt;
-
-       __asm__ __volatile__ (
-               "lidtq %0\n"
-               : : "m" (curidt)
-               );
-};
-
-
-static void set_gdt(void *newgdt, u16 limit)
-{
-       struct desc_ptr curgdt;
-
-       /* x86-64 supports unaligned loads & stores */
-       curgdt.size    = limit;
-       curgdt.address = (unsigned long)newgdt;
-
-       __asm__ __volatile__ (
-               "lgdtq %0\n"
-               : : "m" (curgdt)
-               );
-};
-
-static void load_segments(void)
-{
-       __asm__ __volatile__ (
-               "\tmovl %0,%%ds\n"
-               "\tmovl %0,%%es\n"
-               "\tmovl %0,%%ss\n"
-               "\tmovl %0,%%fs\n"
-               "\tmovl %0,%%gs\n"
-               : : "a" (__KERNEL_DS) : "memory"
-               );
-}
-
 int machine_kexec_prepare(struct kimage *image)
 {
        unsigned long start_pgtable;
@@ -265,6 +339,7 @@ void machine_kexec_cleanup(struct kimage *image)
        free_transition_pgtable(image);
 }
 
+#ifndef CONFIG_XEN
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
@@ -311,24 +386,6 @@ void machine_kexec(struct kimage *image)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);
 
-       /*
-        * The segment registers are funny things, they have both a
-        * visible and an invisible part.  Whenever the visible part is
-        * set to a specific selector, the invisible part is loaded
-        * with from a table in memory.  At no other time is the
-        * descriptor table in memory accessed.
-        *
-        * I take advantage of this here by force loading the
-        * segments, before I zap the gdt with an invalid value.
-        */
-       load_segments();
-       /*
-        * The gdt & idt are now invalid.
-        * If you want to load them you must set up your own idt & gdt.
-        */
-       set_gdt(phys_to_virt(0), 0);
-       set_idt(phys_to_virt(0), 0);
-
        /* now call it */
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
@@ -342,10 +399,13 @@ void machine_kexec(struct kimage *image)
 
        __ftrace_enabled_restore(save_ftrace_enabled);
 }
+#endif
 
 void arch_crash_save_vmcoreinfo(void)
 {
+#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
        VMCOREINFO_SYMBOL(phys_base);
+#endif
        VMCOREINFO_SYMBOL(init_level4_pgt);
 
 #ifdef CONFIG_NUMA
diff --git a/arch/x86/kernel/relocate_kernel_32.S 
b/arch/x86/kernel/relocate_kernel_32.S
index 4123553..fe0fbfb 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -87,14 +87,32 @@ relocate_kernel:
        movl    PTR(PA_PGD)(%ebp), %eax
        movl    %eax, %cr3
 
+       /* setup idt */
+       lidtl   idt_48 - relocate_kernel(%edi)
+
+       /* setup gdt */
+       leal    gdt - relocate_kernel(%edi), %eax
+       movl    %eax, (gdt_48 - relocate_kernel) + 2(%edi)
+       lgdtl   gdt_48 - relocate_kernel(%edi)
+
+       /* setup data segment registers */
+       mov     $(gdt_ds - gdt), %eax
+       mov     %eax, %ds
+       mov     %eax, %es
+       mov     %eax, %fs
+       mov     %eax, %gs
+       mov     %eax, %ss
+
        /* setup a new stack at the end of the physical control page */
        lea     PAGE_SIZE(%edi), %esp
 
-       /* jump to identity mapped page */
+       /* load new code segment and jump to identity mapped page */
+       pushl   $0
+       pushl   $(gdt_cs - gdt)
        movl    %edi, %eax
        addl    $(identity_mapped - relocate_kernel), %eax
        pushl   %eax
-       ret
+       iretl
 
 identity_mapped:
        /* store the start address on the stack */
@@ -271,5 +289,22 @@ swap_pages:
        popl    %ebp
        ret
 
+       .align  16
+gdt:
+       .quad   0x0000000000000000      /* NULL descriptor */
+gdt_cs:
+       .quad   0x00cf9a000000ffff      /* kernel 4GB code at 0x00000000 */
+gdt_ds:
+       .quad   0x00cf92000000ffff      /* kernel 4GB data at 0x00000000 */
+gdt_end:
+
+gdt_48:
+       .word   gdt_end - gdt - 1       /* limit */
+       .long   0                       /* base - filled in by code above */
+
+idt_48:
+       .word   0                       /* limit */
+       .long   0                       /* base */
+
        .globl kexec_control_code_size
 .set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S 
b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b..bb0455d 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -91,13 +91,30 @@ relocate_kernel:
        /* Switch to the identity mapped page tables */
        movq    %r9, %cr3
 
+       /* setup idt */
+       lidtq   idt_80 - relocate_kernel(%r8)
+
+       /* setup gdt */
+       leaq    gdt - relocate_kernel(%r8), %rax
+       movq    %rax, (gdt_80 - relocate_kernel) + 2(%r8)
+       lgdtq   gdt_80 - relocate_kernel(%r8)
+
+       /* setup data segment registers */
+       xorl    %eax, %eax
+       movl    %eax, %ds
+       movl    %eax, %es
+       movl    %eax, %fs
+       movl    %eax, %gs
+       movl    %eax, %ss
+
        /* setup a new stack at the end of the physical control page */
        lea     PAGE_SIZE(%r8), %rsp
 
-       /* jump to identity mapped page */
+       /* load new code segment and jump to identity mapped page */
        addq    $(identity_mapped - relocate_kernel), %r8
+       pushq   $(gdt_cs - gdt)
        pushq   %r8
-       ret
+       lretq
 
 identity_mapped:
        /* store the start address on the stack */
@@ -262,5 +279,20 @@ swap_pages:
 3:
        ret
 
+       .align  16
+gdt:
+       .quad   0x0000000000000000      /* NULL descriptor */
+gdt_cs:
+       .quad   0x00af9a000000ffff
+gdt_end:
+
+gdt_80:
+       .word   gdt_end - gdt - 1       /* limit */
+       .quad   0                       /* base - filled in by code above */
+
+idt_80:
+       .word   0                       /* limit */
+       .quad   0                       /* base */
+
        .globl kexec_control_code_size
 .set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c6724e4..b978d7e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -509,7 +509,7 @@ static void __init 
memblock_x86_reserve_range_setup_data(void)
  */
 
 #ifdef CONFIG_KEXEC
-
+#ifndef CONFIG_XEN
 static inline unsigned long long get_total_mem(void)
 {
        unsigned long long total;
@@ -581,6 +581,9 @@ static void __init reserve_crashkernel(void)
        insert_resource(&iomem_resource, &crashk_res);
 }
 #else
+#define reserve_crashkernel xen_machine_kexec_setup_resources
+#endif
+#else
 static void __init reserve_crashkernel(void)
 {
 }
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 8a8a156..b504d0e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1030,7 +1030,9 @@ static void xen_emergency_restart(void)
 
 static void xen_machine_halt(void)
 {
+#ifndef CONFIG_KEXEC
        xen_reboot(SHUTDOWN_poweroff);
+#endif
 }
 
 static void xen_machine_power_off(void)
@@ -1040,10 +1042,13 @@ static void xen_machine_power_off(void)
        xen_reboot(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
 static void xen_crash_shutdown(struct pt_regs *regs)
 {
-       xen_reboot(SHUTDOWN_crash);
+       /* The kernel is broken so disable interrupts */
+       local_irq_disable();
 }
+#endif
 
 static int
 xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
@@ -1067,8 +1072,10 @@ static const struct machine_ops xen_machine_ops 
__initconst = {
        .halt = xen_machine_halt,
        .power_off = xen_machine_power_off,
        .shutdown = xen_machine_halt,
-       .crash_shutdown = xen_crash_shutdown,
        .emergency_restart = xen_emergency_restart,
+#ifdef CONFIG_KEXEC
+       .crash_shutdown = xen_crash_shutdown
+#endif
 };
 
 /*
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 251acea..24d71fd 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -106,7 +106,7 @@ static inline void register_cpu_control(struct cpu *cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-#ifdef CONFIG_KEXEC
+#if defined(CONFIG_KEXEC) && !defined(CONFIG_XEN)
 #include <linux/kexec.h>
 
 static ssize_t show_crash_notes(struct sys_device *dev, struct 
sysdev_attribute *attr,
@@ -231,7 +231,7 @@ int __cpuinit register_cpu(struct cpu *cpu, int num)
        if (!error)
                register_cpu_under_node(num, cpu_to_node(num));
 
-#ifdef CONFIG_KEXEC
+#if defined(CONFIG_KEXEC) && !defined(CONFIG_XEN)
        if (!error)
                error = sysdev_create_file(&cpu->sysdev, &attr_crash_notes);
 #endif
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index f1d5622..c0451cd 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_ACPI_PROCESSOR_XEN) += acpi_processor.o
 obj-$(CONFIG_SWIOTLB_XEN)              += swiotlb-xen.o
 obj-$(CONFIG_XEN_DOM0)                 += pci.o
 obj-$(CONFIG_XEN_TMEM)         += tmem.o
+obj-$(CONFIG_KEXEC)                    += machine_kexec.o
 
 xen-evtchn-y                           := evtchn.o
 xen-gntdev-y                           := gntdev.o
diff --git a/drivers/xen/machine_kexec.c b/drivers/xen/machine_kexec.c
new file mode 100644
index 0000000..8cd20e4
--- /dev/null
+++ b/drivers/xen/machine_kexec.c
@@ -0,0 +1,256 @@
+/*
+ * Handle transition of Linux booting another kernel.
+ */
+
+#include <linux/kexec.h>
+#include <linux/reboot.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+
+#include <xen/xen-ops.h>
+
+#include <xen/interface/kexec.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+
+extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, 
+                                        struct kimage *image);
+extern int machine_kexec_setup_resources(struct resource *hypervisor,
+                                        struct resource *phys_cpus,
+                                        int nr_phys_cpus);
+extern void machine_kexec_register_resources(struct resource *res);
+
+static int __initdata xen_max_nr_phys_cpus;
+static struct resource xen_hypervisor_res;
+#if 0
+static struct resource *xen_phys_cpus;
+#endif
+static struct resource xen_phys_cpus[16];
+
+size_t vmcoreinfo_size_xen;
+unsigned long paddr_vmcoreinfo_xen;
+
+void __init xen_machine_kexec_setup_resources(void)
+{
+       xen_kexec_range_t range;
+       struct resource *res;
+       int k = 0;
+       int rc;
+
+       if (strstr(boot_command_line, "crashkernel="))
+               printk(KERN_WARNING "Ignoring crashkernel command line, "
+                      "parameter will be supplied by xen\n");
+
+       if (!xen_initial_domain())
+               return;
+
+       /* determine maximum number of physical cpus */
+
+       while (1) {
+               memset(&range, 0, sizeof(range));
+               range.range = KEXEC_RANGE_MA_CPU;
+               range.nr = k;
+
+               if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+                       break;
+
+               k++;
+       }
+
+       if (k == 0)
+               return;
+
+       xen_max_nr_phys_cpus = k;
+
+#if 0
+       /* allocate xen_phys_cpus */
+
+       xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource));
+#endif
+
+       /* fill in xen_phys_cpus with per-cpu crash note information */
+
+       for (k = 0; k < xen_max_nr_phys_cpus; k++) {
+               memset(&range, 0, sizeof(range));
+               range.range = KEXEC_RANGE_MA_CPU;
+               range.nr = k;
+
+               if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+                       goto err;
+
+               res = xen_phys_cpus + k;
+
+               memset(res, 0, sizeof(*res));
+               res->name = "Crash note";
+               res->start = range.start;
+               res->end = range.start + range.size - 1;
+               res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       }
+
+       /* fill in xen_hypervisor_res with hypervisor machine address range */
+
+       memset(&range, 0, sizeof(range));
+       range.range = KEXEC_RANGE_MA_XEN;
+
+       if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+               goto err;
+
+       xen_hypervisor_res.name = "Hypervisor code and data";
+       xen_hypervisor_res.start = range.start;
+       xen_hypervisor_res.end = range.start + range.size - 1;
+       xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+#ifdef CONFIG_X86
+       insert_resource(&iomem_resource, &xen_hypervisor_res);
+#endif
+
+       /* fill in crashk_res if range is reserved by hypervisor */
+
+       memset(&range, 0, sizeof(range));
+       range.range = KEXEC_RANGE_MA_CRASH;
+
+       if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+               goto err;
+
+       if (range.size) {
+               crashk_res.start = range.start;
+               crashk_res.end = range.start + range.size - 1;
+#ifdef CONFIG_X86
+               insert_resource(&iomem_resource, &crashk_res);
+#endif
+       }
+
+       /* get physical address of vmcoreinfo */
+       memset(&range, 0, sizeof(range));
+       range.range = KEXEC_RANGE_MA_VMCOREINFO;
+
+       rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range);
+
+       if (rc == 0) {
+               /* Hypercall succeeded */
+               vmcoreinfo_size_xen = range.size;
+               paddr_vmcoreinfo_xen = range.start;
+
+       } else {
+               /* Hypercall failed.
+                * Indicate not to create sysfs file by resetting globals
+                */
+               vmcoreinfo_size_xen = 0;
+               paddr_vmcoreinfo_xen = 0;
+               
+               /* The KEXEC_CMD_kexec_get_range hypercall did not implement
+                * KEXEC_RANGE_MA_VMCOREINFO until Xen 3.3.
+                * Do not bail out if it fails for this reason.
+                */
+               if (rc != -EINVAL)
+                       return;
+       }
+
+       if (machine_kexec_setup_resources(&xen_hypervisor_res, xen_phys_cpus,
+                                         xen_max_nr_phys_cpus))
+               goto err;
+
+#ifdef CONFIG_X86
+       for (k = 0; k < xen_max_nr_phys_cpus; k++) {
+               res = xen_phys_cpus + k;
+               if (!res->parent) /* outside of xen_hypervisor_res range */
+                       insert_resource(&iomem_resource, res);
+       }
+
+       if (xen_create_contiguous_region((unsigned long)&vmcoreinfo_note,
+                                        get_order(sizeof(vmcoreinfo_note)),
+                                        BITS_PER_LONG))
+               goto err;
+#endif
+
+       return;
+
+ err:
+       /*
+        * It isn't possible to free xen_phys_cpus this early in the
+        * boot. Failure at this stage is unexpected and the amount of
+        * memory is small therefore we tolerate the potential leak.
+         */
+       xen_max_nr_phys_cpus = 0;
+       return;
+}
+
+#ifndef CONFIG_X86
+void __init xen_machine_kexec_register_resources(struct resource *res)
+{
+       int k;
+       struct resource *r;
+
+       request_resource(res, &xen_hypervisor_res);
+       for (k = 0; k < xen_max_nr_phys_cpus; k++) {
+               r = xen_phys_cpus + k;
+               if (r->parent == NULL) /* out of xen_hypervisor_res range */
+                       request_resource(res, r);
+       } 
+       machine_kexec_register_resources(res);
+}
+#endif
+
+static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+       machine_kexec_setup_load_arg(xki, image);
+
+       xki->indirection_page = image->head;
+       xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+       xen_kexec_load_t xkl;
+
+       memset(&xkl, 0, sizeof(xkl));
+       xkl.type = image->type;
+       setup_load_arg(&xkl.image, image);
+       return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
+}
+
+/*
+ * Unload the image that was stored by machine_kexec_load()
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and it's possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+       xen_kexec_load_t xkl;
+
+       memset(&xkl, 0, sizeof(xkl));
+       xkl.type = image->type;
+       WARN_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl));
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU, 
+ * stop all CPUs and kexec. That is it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+       xen_kexec_exec_t xke;
+
+       memset(&xke, 0, sizeof(xke));
+       xke.type = image->type;
+       (void)HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
+       panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+#ifdef CONFIG_X86
+unsigned long paddr_vmcoreinfo_note(void)
+{
+       return virt_to_machine(&vmcoreinfo_note).maddr;
+}
+#endif
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 1e0fe01..0dc4f51 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -355,6 +355,31 @@ static void xen_properties_destroy(void)
        sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
 }
 
+#ifdef CONFIG_KEXEC
+
+extern size_t vmcoreinfo_size_xen;
+extern unsigned long paddr_vmcoreinfo_xen;
+
+static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page)
+{
+       return sprintf(page, "%lx %zx\n",
+               paddr_vmcoreinfo_xen, vmcoreinfo_size_xen);
+}
+
+HYPERVISOR_ATTR_RO(vmcoreinfo);
+
+static int __init xen_sysfs_vmcoreinfo_init(void)
+{
+       return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+
+static void xen_sysfs_vmcoreinfo_destroy(void)
+{
+       sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+
+#endif
+
 static int __init hyper_sysfs_init(void)
 {
        int ret;
@@ -377,9 +402,20 @@ static int __init hyper_sysfs_init(void)
        ret = xen_properties_init();
        if (ret)
                goto prop_out;
+#ifdef CONFIG_KEXEC
+       if (vmcoreinfo_size_xen) {
+               ret = xen_sysfs_vmcoreinfo_init();
+               if (ret)
+                       goto vmcoreinfo_out;
+       }
+#endif
 
        goto out;
 
+#ifdef CONFIG_KEXEC
+vmcoreinfo_out:
+#endif
+       xen_properties_destroy();
 prop_out:
        xen_sysfs_uuid_destroy();
 uuid_out:
@@ -394,6 +430,10 @@ out:
 
 static void __exit hyper_sysfs_exit(void)
 {
+#ifdef CONFIG_KEXEC
+       if (vmcoreinfo_size_xen)
+               xen_sysfs_vmcoreinfo_destroy();
+#endif
        xen_properties_destroy();
        xen_compilation_destroy();
        xen_sysfs_uuid_destroy();
diff --git a/drivers/xen/xenbus/xenbus_probe.c 
b/drivers/xen/xenbus/xenbus_probe.c
index 7397695..4ffe83c 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -673,8 +673,106 @@ void unregister_xenstore_notifier(struct notifier_block 
*nb)
 }
 EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
 
+#ifdef CONFIG_CRASH_DUMP
+static DECLARE_WAIT_QUEUE_HEAD(be_state_wq);
+static int be_state;
+
+static void xenbus_reset_state_changed(struct xenbus_watch *w, const char **v, 
unsigned int l)
+{
+       xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &be_state);
+       printk(KERN_INFO "XENBUS: %s %s\n", v[XS_WATCH_PATH], 
xenbus_strstate(be_state));
+       wake_up(&be_state_wq);
+}
+
+static int xenbus_reset_check_final(int *st)
+{
+       return *st == XenbusStateInitialising || *st == XenbusStateInitWait;
+}
+
+static void xenbus_reset_frontend_state(char *backend, char *frontend)
+{
+       struct xenbus_watch watch;
+
+       memset(&watch, 0, sizeof(watch));
+       watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", backend);
+       if (!watch.node)
+               return;
+
+       watch.callback = xenbus_reset_state_changed;
+       be_state = XenbusStateUnknown;
+
+       printk(KERN_INFO "XENBUS: triggering reconnect on %s\n", backend);
+       register_xenbus_watch(&watch);
+
+       xenbus_printf(XBT_NIL, frontend, "state", "%d", XenbusStateClosing);
+       wait_event_interruptible(be_state_wq, be_state == XenbusStateClosing);
+
+       xenbus_printf(XBT_NIL, frontend, "state", "%d", XenbusStateClosed);
+       wait_event_interruptible(be_state_wq, be_state == XenbusStateClosed);
+
+       xenbus_printf(XBT_NIL, frontend, "state", "%d", 
XenbusStateInitialising);
+       wait_event_interruptible(be_state_wq, 
xenbus_reset_check_final(&be_state));
+
+       unregister_xenbus_watch(&watch);
+       printk(KERN_INFO "XENBUS: reconnect done on %s\n", backend);
+       kfree(watch.node);
+}
+
+static void xenbus_reset_check_state(char *class, char *dev)
+{
+       int state, err;
+       char *backend, *frontend;
+
+       frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev);
+       if (!frontend)
+               return;
+
+       err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &state);
+       /* frontend connected? */
+       if (err == 1 && state == XenbusStateConnected) {
+               backend = xenbus_read(XBT_NIL, frontend, "backend", NULL);
+               if (!backend || IS_ERR(backend))
+                       goto out;
+               err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &state);
+               /* backend connected? */
+               if (err == 1 && state == XenbusStateConnected)
+                       xenbus_reset_frontend_state(backend, frontend);
+               kfree(backend);
+       }
+out:
+       kfree(frontend);
+}
+
+static void xenbus_reset_state(void)
+{
+       char **devclass, **dev;
+       int devclass_n, dev_n;
+       int i, j;
+
+       devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n);
+       if (IS_ERR(devclass))
+               return;
+
+       for (i = 0; i < devclass_n; i++) {
+               dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n);
+               if (IS_ERR(dev))
+                       continue;
+               for (j = 0; j < dev_n; j++)
+                       xenbus_reset_check_state(devclass[i], dev[j]);
+               kfree(dev);
+       }
+       kfree(devclass);
+}
+#endif
+
 void xenbus_probe(struct work_struct *unused)
 {
+#ifdef CONFIG_CRASH_DUMP
+       /* reset devices in XenbusStateConnected state */
+       if (reset_devices)
+               xenbus_reset_state();
+#endif
+
        xenstored_ready = 1;
 
        /* Notify others that xenstore is up */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index c2478a3..15565c6 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -112,6 +112,12 @@ struct kimage {
 extern void machine_kexec(struct kimage *image);
 extern int machine_kexec_prepare(struct kimage *image);
 extern void machine_kexec_cleanup(struct kimage *image);
+#ifdef CONFIG_XEN
+extern int xen_machine_kexec_load(struct kimage *image);
+extern void xen_machine_kexec_unload(struct kimage *image);
+extern void xen_machine_kexec_setup_resources(void);
+extern void xen_machine_kexec_register_resources(struct resource *res);
+#endif
 extern asmlinkage long sys_kexec_load(unsigned long entry,
                                        unsigned long nr_segments,
                                        struct kexec_segment __user *segments,
@@ -192,8 +198,15 @@ extern struct kimage *kexec_crash_image;
 #define VMCOREINFO_BYTES           (4096)
 #define VMCOREINFO_NOTE_NAME       "VMCOREINFO"
 #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
+#if !defined(CONFIG_XEN) || !defined(CONFIG_X86)
 #define VMCOREINFO_NOTE_SIZE       (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES 
\
                                    + VMCOREINFO_NOTE_NAME_BYTES)
+#else
+#define VMCOREINFO_NOTE_SIZE       ALIGN(KEXEC_NOTE_HEAD_BYTES*2 \
+                                        + VMCOREINFO_BYTES \
+                                        + VMCOREINFO_NOTE_NAME_BYTES, \
+                                        PAGE_SIZE)
+#endif
 
 /* Location of a reserved region to hold the crash kernel.
  */
diff --git a/include/xen/interface/kexec.h b/include/xen/interface/kexec.h
new file mode 100644
index 0000000..5fd0495
--- /dev/null
+++ b/include/xen/interface/kexec.h
@@ -0,0 +1,158 @@
+/******************************************************************************
+ * kexec.h - Public portion
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ * 
+ * Xen port written by:
+ * - Simon 'Horms' Horman <horms@xxxxxxxxxxxx>
+ * - Magnus Damm <magnus@xxxxxxxxxxxxx>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+
+/* This file describes the Kexec / Kdump hypercall interface for Xen.
+ *
+ * Kexec under vanilla Linux allows a user to reboot the physical machine 
+ * into a new user-specified kernel. The Xen port extends this idea
+ * to allow rebooting of the machine from dom0. When kexec for dom0
+ * is used to reboot,  both the hypervisor and the domains get replaced
+ * with some other kernel. It is possible to kexec between vanilla
+ * Linux and Xen and back again. Xen to Xen works well too.
+ *
+ * The hypercall interface for kexec can be divided into three main
+ * types of hypercall operations:
+ *
+ * 1) Range information:
+ *    This is used by the dom0 kernel to ask the hypervisor about various 
+ *    address information. This information is needed to allow kexec-tools 
+ *    to fill in the ELF headers for /proc/vmcore properly.
+ *
+ * 2) Load and unload of images:
+ *    There are no big surprises here, the kexec binary from kexec-tools
+ *    runs in userspace in dom0. The tool loads/unloads data into the
+ *    dom0 kernel such as new kernel, initramfs and hypervisor. When
+ *    loaded the dom0 kernel performs a load hypercall operation, and
+ *    before releasing all page references the dom0 kernel calls unload.
+ *
+ * 3) Kexec operation:
+ *    This is used to start a previously loaded kernel.
+ */
+
+#include "xen.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#define KEXEC_XEN_NO_PAGES 17
+#endif
+
+/*
+ * Prototype for this hypercall is:
+ *  int kexec_op(int cmd, void *args)
+ * @cmd  == KEXEC_CMD_... 
+ *          KEXEC operation to perform
+ * @args == Operation-specific extra arguments (NULL if none).
+ */
+
+/*
+ * Kexec supports two types of operation:
+ * - kexec into a regular kernel, very similar to a standard reboot
+ *   - KEXEC_TYPE_DEFAULT is used to specify this type
+ * - kexec into a special "crash kernel", aka kexec-on-panic
+ *   - KEXEC_TYPE_CRASH is used to specify this type
+ *   - parts of our system may be broken at kexec-on-panic time
+ *     - the code should be kept as simple and self-contained as possible
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+
+/* The kexec implementation for Xen allows the user to load two
+ * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
+ * All data needed for a kexec reboot is kept in one xen_kexec_image_t
+ * per "instance". The data mainly consists of machine address lists to pages
+ * together with destination addresses. The data in xen_kexec_image_t
+ * is passed to the "code page" which is one page of code that performs
+ * the final relocations before jumping to the new kernel.
+ */
+ 
+typedef struct xen_kexec_image {
+#if defined(__i386__) || defined(__x86_64__)
+    unsigned long page_list[KEXEC_XEN_NO_PAGES];
+#endif
+#if defined(__ia64__)
+    unsigned long reboot_code_buffer;
+#endif
+    unsigned long indirection_page;
+    unsigned long start_address;
+} xen_kexec_image_t;
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ */
+#define KEXEC_CMD_kexec                 0
+typedef struct xen_kexec_exec {
+    int type;
+} xen_kexec_exec_t;
+
+/*
+ * Load/Unload kernel image for kexec or kdump.
+ * type  == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ * image == relocation information for kexec (ignored for unload) [in]
+ */
+#define KEXEC_CMD_kexec_load            1
+#define KEXEC_CMD_kexec_unload          2
+typedef struct xen_kexec_load {
+    int type;
+    xen_kexec_image_t image;
+} xen_kexec_load_t;
+
+#define KEXEC_RANGE_MA_CRASH      0 /* machine address and size of crash area 
*/
+#define KEXEC_RANGE_MA_XEN        1 /* machine address and size of Xen itself 
*/
+#define KEXEC_RANGE_MA_CPU        2 /* machine address and size of a CPU note 
*/
+#define KEXEC_RANGE_MA_XENHEAP    3 /* machine address and size of xenheap
+                                     * Note that although this is adjacent
+                                     * to Xen it exists in a separate EFI
+                                     * region on ia64, and thus needs to be
+                                     * inserted into iomem_machine separately 
*/
+#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* machine address and size of
+                                     * the ia64_boot_param */
+#define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of
+                                     * the EFI Memory Map */
+#define KEXEC_RANGE_MA_VMCOREINFO 6 /* machine address and size of vmcoreinfo 
*/
+
+/*
+ * Find the address and size of certain memory areas
+ * range == KEXEC_RANGE_... [in]
+ * nr    == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in]
+ * size  == number of bytes reserved in window [out]
+ * start == address of the first byte in the window [out]
+ */
+#define KEXEC_CMD_kexec_get_range       3
+typedef struct xen_kexec_range {
+    int range;
+    int nr;
+    unsigned long size;
+    unsigned long start;
+} xen_kexec_range_t;
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 9f2d370..2e23363 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -58,6 +58,7 @@
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
 #define __HYPERVISOR_hvm_op               34
+#define __HYPERVISOR_kexec_op             37
 #define __HYPERVISOR_tmem_op              38
 
 /* Architecture-specific hypercall definitions. */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87b77de..b92fdf0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -35,18 +35,26 @@
 #include <linux/kmsg_dump.h>
 #include <linux/syscore_ops.h>
 
+#include <xen/xen-ops.h>
+
 #include <asm/page.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/system.h>
 #include <asm/sections.h>
 
+#include <asm/xen/page.h>
+
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t __percpu *crash_notes;
 
 /* vmcoreinfo stuff */
 static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+#if defined(CONFIG_XEN) && defined(CONFIG_X86)
+u32 __page_aligned_bss vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+#else
 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+#endif
 size_t vmcoreinfo_size;
 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
 
@@ -357,13 +365,26 @@ static int kimage_is_destination_range(struct kimage 
*image,
        return 0;
 }
 
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order, 
unsigned long limit)
 {
        struct page *pages;
 
        pages = alloc_pages(gfp_mask, order);
        if (pages) {
                unsigned int count, i;
+#ifdef CONFIG_XEN
+               int address_bits;
+
+               if (limit == ~0UL)
+                       address_bits = BITS_PER_LONG;
+               else
+                       address_bits = ilog2(limit);
+
+               if (xen_create_contiguous_region((unsigned 
long)page_address(pages), order, address_bits) < 0) {
+                       __free_pages(pages, order);
+                       return NULL;
+               }
+#endif
                pages->mapping = NULL;
                set_page_private(pages, order);
                count = 1 << order;
@@ -427,10 +448,10 @@ static struct page 
*kimage_alloc_normal_control_pages(struct kimage *image,
        do {
                unsigned long pfn, epfn, addr, eaddr;
 
-               pages = kimage_alloc_pages(GFP_KERNEL, order);
+               pages = kimage_alloc_pages(GFP_KERNEL, order, 
KEXEC_CONTROL_MEMORY_LIMIT);
                if (!pages)
                        break;
-               pfn   = page_to_pfn(pages);
+               pfn   = pfn_to_mfn(page_to_pfn(pages));
                epfn  = pfn + count;
                addr  = pfn << PAGE_SHIFT;
                eaddr = epfn << PAGE_SHIFT;
@@ -464,6 +485,7 @@ static struct page 
*kimage_alloc_normal_control_pages(struct kimage *image,
        return pages;
 }
 
+#ifndef CONFIG_XEN
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
                                                      unsigned int order)
 {
@@ -517,7 +539,7 @@ static struct page *kimage_alloc_crash_control_pages(struct 
kimage *image,
                }
                /* If I don't overlap any segments I have found my hole! */
                if (i == image->nr_segments) {
-                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                       pages = pfn_to_page(mfn_to_pfn(hole_start >> 
PAGE_SHIFT));
                        break;
                }
        }
@@ -544,6 +566,13 @@ struct page *kimage_alloc_control_pages(struct kimage 
*image,
 
        return pages;
 }
+#else /* !CONFIG_XEN */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                        unsigned int order)
+{
+       return kimage_alloc_normal_control_pages(image, order);
+}
+#endif
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
@@ -559,7 +588,7 @@ static int kimage_add_entry(struct kimage *image, 
kimage_entry_t entry)
                        return -ENOMEM;
 
                ind_page = page_address(page);
-               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               *image->entry = virt_to_machine(ind_page).maddr | 
IND_INDIRECTION;
                image->entry = ind_page;
                image->last_entry = ind_page +
                                      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -618,13 +647,13 @@ static void kimage_terminate(struct kimage *image)
 #define for_each_kimage_entry(image, ptr, entry) \
        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                ptr = (entry & IND_INDIRECTION)? \
-                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
+                       phys_to_virt(machine_to_phys(XMADDR(entry & 
PAGE_MASK)).paddr): ptr +1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
        struct page *page;
 
-       page = pfn_to_page(entry >> PAGE_SHIFT);
+       page = pfn_to_page(mfn_to_pfn(entry >> PAGE_SHIFT));
        kimage_free_pages(page);
 }
 
@@ -636,6 +665,10 @@ static void kimage_free(struct kimage *image)
        if (!image)
                return;
 
+#ifdef CONFIG_XEN
+       xen_machine_kexec_unload(image);
+#endif
+
        kimage_free_extra_pages(image);
        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_INDIRECTION) {
@@ -711,7 +744,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
         * have a match.
         */
        list_for_each_entry(page, &image->dest_pages, lru) {
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT;
                if (addr == destination) {
                        list_del(&page->lru);
                        return page;
@@ -722,16 +755,16 @@ static struct page *kimage_alloc_page(struct kimage 
*image,
                kimage_entry_t *old;
 
                /* Allocate a page, if we run out of memory give up */
-               page = kimage_alloc_pages(gfp_mask, 0);
+               page = kimage_alloc_pages(gfp_mask, 0, 
KEXEC_SOURCE_MEMORY_LIMIT);
                if (!page)
                        return NULL;
                /* If the page cannot be used file it away */
-               if (page_to_pfn(page) >
+               if (pfn_to_mfn(page_to_pfn(page)) >
                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                        list_add(&page->lru, &image->unuseable_pages);
                        continue;
                }
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT;
 
                /* If it is the destination page we want use it */
                if (addr == destination)
@@ -754,7 +787,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
                        struct page *old_page;
 
                        old_addr = *old & PAGE_MASK;
-                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       old_page = pfn_to_page(mfn_to_pfn(old_addr >> 
PAGE_SHIFT));
                        copy_highpage(page, old_page);
                        *old = addr | (*old & ~PAGE_MASK);
 
@@ -810,7 +843,7 @@ static int kimage_load_normal_segment(struct kimage *image,
                        result  = -ENOMEM;
                        goto out;
                }
-               result = kimage_add_page(image, page_to_pfn(page)
+               result = kimage_add_page(image, pfn_to_mfn(page_to_pfn(page))
                                                                << PAGE_SHIFT);
                if (result < 0)
                        goto out;
@@ -842,6 +875,7 @@ out:
        return result;
 }
 
+#ifndef CONFIG_XEN
 static int kimage_load_crash_segment(struct kimage *image,
                                        struct kexec_segment *segment)
 {
@@ -864,7 +898,7 @@ static int kimage_load_crash_segment(struct kimage *image,
                char *ptr;
                size_t uchunk, mchunk;
 
-               page = pfn_to_page(maddr >> PAGE_SHIFT);
+               page = pfn_to_page(mfn_to_pfn(maddr >> PAGE_SHIFT));
                if (!page) {
                        result  = -ENOMEM;
                        goto out;
@@ -913,6 +947,13 @@ static int kimage_load_segment(struct kimage *image,
 
        return result;
 }
+#else /* CONFIG_XEN */
+static int kimage_load_segment(struct kimage *image,
+                               struct kexec_segment *segment)
+{
+       return kimage_load_normal_segment(image, segment);
+}
+#endif
 
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
@@ -1016,6 +1057,13 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, 
unsigned long, nr_segments,
                }
                kimage_terminate(image);
        }
+#ifdef CONFIG_XEN
+       if (image) {
+               result = xen_machine_kexec_load(image);
+               if (result)
+                       goto out;
+       }
+#endif
        /* Install the new kernel, and  Uninstall the old */
        image = xchg(dest_image, image);
 
@@ -1106,8 +1154,8 @@ void __weak crash_free_reserved_phys_range(unsigned long 
begin,
        unsigned long addr;
 
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
-               ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
-               init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
+               ClearPageReserved(pfn_to_page(mfn_to_pfn(addr >> PAGE_SHIFT)));
+               init_page_count(pfn_to_page(mfn_to_pfn(addr >> PAGE_SHIFT)));
                free_page((unsigned long)__va(addr));
                totalram_pages++;
        }
@@ -1216,6 +1264,7 @@ static int __init crash_notes_memory_init(void)
 module_init(crash_notes_memory_init)
 
 
+#ifndef CONFIG_XEN
 /*
  * parsing the "crashkernel" commandline
  *
@@ -1378,6 +1427,7 @@ int __init parse_crashkernel(char                  
*cmdline,
 
        return 0;
 }
+#endif
 
 
 
@@ -1435,7 +1485,18 @@ static int __init crash_save_vmcoreinfo_init(void)
 
        VMCOREINFO_SYMBOL(init_uts_ns);
        VMCOREINFO_SYMBOL(node_online_map);
+#ifndef CONFIG_X86_XEN
        VMCOREINFO_SYMBOL(swapper_pg_dir);
+#else
+/*
+ * Since for x86-32 Xen swapper_pg_dir is a pointer rather than an array,
+ * make the value stored consistent with native (i.e. the base address of
+ * the page directory).
+ */
+# define swapper_pg_dir *swapper_pg_dir
+       VMCOREINFO_SYMBOL(swapper_pg_dir);
+# undef swapper_pg_dir
+#endif
        VMCOREINFO_SYMBOL(_stext);
        VMCOREINFO_SYMBOL(vmlist);
 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.