
Re: [Xen-devel] [PATCH 05/10] kexec: extend hypercall with improved load/unload ops



>>> On 24.06.13 at 19:42, David Vrabel <david.vrabel@xxxxxxxxxx> wrote:
> +static void init_level2_page(l2_pgentry_t *l2, unsigned long addr)
> +{
> +    unsigned long end_addr;
> +
> +    addr &= PAGE_MASK;
> +    end_addr = addr + L2_PAGETABLE_ENTRIES * (1ul << L2_PAGETABLE_SHIFT);

For one, with the code below, this can't be right: "addr" is only
getting page aligned, but the rest of the code assumes it is
suitably aligned for a superpage mapping.
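
Presumably the intent is a superpage boundary, i.e. something
along the lines of (untested)

    addr &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);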

Further, you're risking mapping non-memory here, as the caller
doesn't pass the true end of the memory block to be mapped.

And then the expression is odd - why not just

    end_addr = addr + (L2_PAGETABLE_ENTRIES << L2_PAGETABLE_SHIFT);

> +static int init_level4_page(struct kexec_image *image, l4_pgentry_t *l4,
> +                            unsigned long addr, unsigned long last_addr)
> +{
> +    unsigned long end_addr;
> +    int result;
> +
> +    addr &= PAGE_MASK;
> +    end_addr = addr + L4_PAGETABLE_ENTRIES * (1ul << L4_PAGETABLE_SHIFT);
> +
> +    while ( (addr < last_addr) && (addr < end_addr) )
> +    {
> +        struct page_info *l3_page;
> +        l3_pgentry_t *l3;
> +
> +        l3_page = kimage_alloc_control_page(image, 0);
> +        if ( !l3_page )
> +            return -ENOMEM;
> +        l3 = __map_domain_page(l3_page);
> +        result = init_level3_page(image, l3, addr, last_addr);
> +        unmap_domain_page(l3);
> +        if (result)

Coding style: Xen wants blanks immediately inside the parentheses,
i.e. "if ( result )".

> +static int init_transition_pgtable(struct kexec_image *image, l4_pgentry_t *l4)
> +{
> +    struct page_info *l3_page;
> +    struct page_info *l2_page;
> +    struct page_info *l1_page;
> +    unsigned long vaddr, paddr;
> +    l3_pgentry_t *l3 = NULL;
> +    l2_pgentry_t *l2 = NULL;
> +    l1_pgentry_t *l1 = NULL;
> +    int ret = -ENOMEM;
> +
> +    vaddr = (unsigned long)kexec_reloc;
> +    paddr = page_to_maddr(image->control_code_page);
> +
> +    l4 += l4_table_offset(vaddr);
> +    if ( !(l4e_get_flags(*l4) & _PAGE_PRESENT) )
> +    {
> +        l3_page = kimage_alloc_control_page(image, 0);
> +        if ( !l3_page )
> +            goto out;
> +        l4e_write(l4, l4e_from_page(l3_page, __PAGE_HYPERVISOR));
> +    }
> +    else
> +        l3_page = l4e_get_page(*l4);
> +
> +    l3 = __map_domain_page(l3_page);
> +    l3 += l3_table_offset(vaddr);
> +    if ( !(l3e_get_flags(*l3) & _PAGE_PRESENT) )
> +    {
> +        l2_page = kimage_alloc_control_page(image, 0);
> +        if ( !l2_page )
> +            goto out;
> +        l3e_write(l3, l3e_from_page(l2_page, __PAGE_HYPERVISOR));
> +    }
> +    else
> +        l2_page = l3e_get_page(*l3);

Afaict you're done using "l3" here, so you should unmap it in order
to reduce the pressure on the domain page mapping resources.
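
E.g. (untested, and assuming the error path can tolerate "l3"
being NULL)

    unmap_domain_page(l3);
    l3 = NULL;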

> +
> +    l2 = __map_domain_page(l2_page);
> +    l2 += l2_table_offset(vaddr);
> +    if ( !(l2e_get_flags(*l2) & _PAGE_PRESENT) )
> +    {
> +        l1_page = kimage_alloc_control_page(image, 0);
> +        if ( !l1_page )
> +            goto out;
> +        l2e_write(l2, l2e_from_page(l1_page, __PAGE_HYPERVISOR));
> +    }
> +    else
> +        l1_page = l2e_get_page(*l2);

Same for "l2" at this point.

> +static int build_reloc_page_table(struct kexec_image *image)
> +{
> +    struct page_info *l4_page;
> +    l4_pgentry_t *l4;
> +    int result;
> +
> +    l4_page = kimage_alloc_control_page(image, 0);
> +    if ( !l4_page )
> +        return -ENOMEM;
> +    l4 = __map_domain_page(l4_page);
> +
> +    result = init_level4_page(image, l4, 0, max_page << PAGE_SHIFT);

What about holes in the physical address space? Not just the
MMIO hole below 4Gb is a problem here, but also discontiguous
physical memory.
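
One (untested) way of dealing with both would be to walk the E820
map and cover only RAM regions, provided the init_level*_page()
helpers learn to cope with being called for multiple disjoint
ranges:

    unsigned int i;

    for ( i = 0; i < e820.nr_map; i++ )
    {
        unsigned long start, end;

        if ( e820.map[i].type != E820_RAM )
            continue;
        start = e820.map[i].addr & PAGE_MASK;
        end = e820.map[i].addr + e820.map[i].size;
        /* ... map [start, end) here ... */
    }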

> --- /dev/null
> +++ b/xen/arch/x86/x86_64/kexec_reloc.S
> @@ -0,0 +1,211 @@
> +/*
> + * Relocate a kexec_image to its destination and call it.
> + *
> + * Copyright (C) 2013 Citrix Systems R&D Ltd.
> + *
> + * Portions derived from Linux's arch/x86/kernel/relocate_kernel_64.S.
> + *
> + *   Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xxxxxxxxxxxx>
> + *
> + * This source code is licensed under the GNU General Public License,
> + * Version 2.  See the file COPYING for more details.
> + */
> +#include <xen/config.h>
> +
> +#include <asm/asm_defns.h>
> +#include <asm/msr.h>
> +#include <asm/page.h>
> +#include <asm/machine_kexec.h>
> +
> +        .text
> +        .align PAGE_SIZE
> +        .code64
> +
> +ENTRY(kexec_reloc)
> +        /* %rdi - code page maddr */
> +        /* %rsi - page table maddr */
> +        /* %rdx - indirection page maddr */
> +        /* %rcx - entry maddr */
> +        /* %r8 - flags */
> +
> +        movq %rdx, %rbx
> +
> +        /* Setup stack. */
> +        leaq (reloc_stack - kexec_reloc)(%rdi), %rsp
> +
> +        /* Load reloc page table. */
> +        movq %rsi, %cr3
> +
> +        /* Jump to identity mapped code. */
> +        leaq (identity_mapped - kexec_reloc)(%rdi), %rax
> +        jmpq *%rax
> +
> +identity_mapped:
> +        pushq %rcx
> +        pushq %rbx
> +        pushq %rsi
> +        pushq %rdi
> +
> +        /*
> +         * Set cr0 to a known state:
> +         *  - Paging enabled
> +         *  - Alignment check disabled
> +         *  - Write protect disabled
> +         *  - No task switch
> +         *  - Don't do FP software emulation.
> +         *  - Protected mode enabled
> +         */
> +        movq    %cr0, %rax
> +        andq    $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
> +        orl     $(X86_CR0_PG | X86_CR0_PE), %eax

Either "andq" and "orq" or "andl" and "orl".

> +        movq    %rax, %cr0
> +
> +        /*
> +         * Set cr4 to a known state:
> +         *  - physical address extension enabled
> +         */
> +        movq    $X86_CR4_PAE, %rax

"movl" suffices here.

> +        movq    %rax, %cr4
> +
> +        movq %rbx, %rdi
> +        call relocate_pages
> +
> +        popq %rdi
> +        popq %rsi
> +        popq %rbx
> +        popq %rcx
> +
> +        /* Need to switch to 32-bit mode? */
> +        testq $KEXEC_RELOC_FLAG_COMPAT, %r8
> +        jnz call_32_bit
> +
> +call_64_bit:
> +        /* Call the image entry point.  This should never return. */
> +        call *%rcx
> +        ud2
> +
> +call_32_bit:
> +        /* Setup IDT. */
> +        lidt compat_mode_idt(%rip)
> +
> +        /* Load compat GDT. */
> +        leaq (compat_mode_gdt - kexec_reloc)(%rdi), %rax
> +        movq %rax, (compat_mode_gdt_desc + 2)(%rip)
> +        lgdt compat_mode_gdt_desc(%rip)
> +
> +        /* Relocate compatibility mode entry point address. */
> +        leal (compatibility_mode - kexec_reloc)(%edi), %eax
> +        movl %eax, compatibility_mode_far(%rip)
> +
> +        /* Enter compatibility mode. */
> +        ljmp *compatibility_mode_far(%rip)

As you're using mnemonic suffixes elsewhere even when not strictly
needed, for consistency I'd recommend using one on this instruction
(and perhaps also on the lidt/lgdt above) too.

> +
> +relocate_pages:
> +        /* %rdi - indirection page maddr */
> +        cld
> +        movq    %rdi, %rcx
> +        xorq    %rdi, %rdi
> +        xorq    %rsi, %rsi

I guess performance and code size aren't of highest importance
here, but "xorl" would suffice in both of the above lines.

> +        jmp     1f
> +
> +0:      /* top, read another word for the indirection page */
> +
> +        movq    (%rbx), %rcx
> +        addq    $8,     %rbx
> +1:
> +        testq   $0x1,   %rcx  /* is it a destination page? */

And "testl" (or even "testb") would be sufficient here. There are
more pointless uses of the "q" suffix further down.

In any event the 0x1 here and the other flags tested for below will
want to become manifest constants.
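
E.g. (borrowing Linux's IND_* names; the last one would be this
patch's addition)

    #define IND_DESTINATION  0x1
    #define IND_INDIRECTION  0x2
    #define IND_DONE         0x4
    #define IND_SOURCE       0x8
    #define IND_ZERO        0x10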

> +        jz      2f
> +        movq    %rcx,   %rdi
> +        andq    $0xfffffffffffff000, %rdi

Whether the number of "f"-s here is correct can't be seen without
actually counting them - either use a manifest constant like
PAGE_MASK, or at least write "$~0xfff".

> +        jmp     0b
> +2:
> +        testq   $0x2,   %rcx  /* is it an indirection page? */
> +        jz      2f
> +        movq    %rcx,   %rbx
> +        andq    $0xfffffffffffff000, %rbx
> +        jmp     0b
> +2:
> +        testq   $0x4,   %rcx  /* is it the done indicator? */
> +        jz      2f
> +        jmp     3f

Just a single (inverse) conditional branch please.

And there are too many "2:" labels here in close succession.
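
I.e. simply

        testq   $0x4,   %rcx  /* is it the done indicator? */
        jnz     3f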

> +2:
> +        testq   $0x8,   %rcx  /* is it the source indicator? */
> +        jz      2f
> +        movq    %rcx,   %rsi  /* For ever source page do a copy */

"every"?

> +        andq    $0xfffffffffffff000, %rsi
> +        movq    $512,   %rcx
> +        rep movsq
> +        jmp     0b
> +2:
> +        testq   $0x10,  %rcx  /* is it the zero indicator? */
> +        jz      0b            /* Ignore it otherwise */
> +        movq    $512,   %rcx  /* Zero the destination page. */
> +        xorq    %rax,   %rax
> +        rep stosq
> +        jmp     0b
> +3:
> +        ret
> +
> +        .code32
> +
> +compatibility_mode:
> +        /* Setup some sane segments. */
> +        movl $0x0008, %eax
> +        movl %eax, %ds
> +        movl %eax, %es
> +        movl %eax, %fs
> +        movl %eax, %gs
> +        movl %eax, %ss
> +
> +        movl %ecx, %ebp
> +
> +        /* Disable paging and therefore leave 64 bit mode. */
> +        movl %cr0, %eax
> +        andl $~X86_CR0_PG, %eax
> +        movl %eax, %cr0
> +
> +        /* Disable long mode */
> +        movl    $MSR_EFER, %ecx
> +        rdmsr
> +        andl    $~EFER_LME, %eax
> +        wrmsr
> +
> +        /* Clear cr4 to disable PAE. */
> +        movl    $0, %eax

        xorl    %eax, %eax

> +        movl    %eax, %cr4
> +
> +        /* Call the image entry point.  This should never return. */
> +        call *%ebp
> +        ud2
> +
> +        .align 16

Why? 4 is all you need.

> +compatibility_mode_far:
> +        .long 0x00000000             /* set in call_32_bit above */
> +        .word 0x0010
> +
> +        .align 16

Even more so here. If you care for alignment, you want 6 mod 8
here, so that the 8-byte base field of the descriptor ends up
naturally aligned.

> +compat_mode_gdt_desc:
> +        .word (3*8)-1
> +        .quad 0x0000000000000000     /* set in call_32_bit above */
> +
> +        .align 16

And just 8 here.

> +compat_mode_gdt:
> +        .quad 0x0000000000000000     /* null                              */
> +        .quad 0x00cf92000000ffff     /* 0x0008 ring 0 data                */
> +        .quad 0x00cf9a000000ffff     /* 0x0010 ring 0 code, compatibility */
> +
> +compat_mode_idt:

compat_mode_idt_desc:

And if you care for alignment, 2 mod 8 here, the base field
being only a .long.

> +        .word   0                    /* limit */
> +        .long   0                    /* base */
> +
> +        /*
> +         * 16 words of stack are more than enough.
> +         */
> +        .fill 16,8,0
> +reloc_stack:

And now you don't care for the stack being mis-aligned?
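
If you do, an ".align 16" ahead of the ".fill" would do - the fill
being 16 * 8 = 128 bytes, "reloc_stack" would then end up 16-byte
aligned.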

Jan
