[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v2 4/5] x86: Port the basic alternative mechanism from Linux to Xen




> -----Original Message-----
> From: Andrew Cooper [mailto:amc96@xxxxxxxxxxxxxxxx] On Behalf Of
> Andrew Cooper
> Sent: Thursday, May 29, 2014 4:56 PM
> To: Wu, Feng; xen-devel@xxxxxxxxxxxxx
> Cc: keir.xen@xxxxxxxxx; JBeulich@xxxxxxxx; ian.campbell@xxxxxxxxxx;
> stefano.stabellini@xxxxxxxxxx; tim@xxxxxxx
> Subject: Re: [PATCH v2 4/5] x86: Port the basic alternative mechanism from
> Linux to Xen
> 
> On 29/05/2014 06:34, Feng Wu wrote:
> > This patch ports the basic alternative mechanism from Linux to Xen.
> > With this mechanism, we can patch code based on the CPU features.
> >
> > Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx>
> > ---
> >   xen/arch/x86/Makefile             |   1 +
> >   xen/arch/x86/alternative.c        | 213
> ++++++++++++++++++++++++++++++++++++++
> >   xen/arch/x86/setup.c              |   3 +
> >   xen/arch/x86/xen.lds.S            |  15 +++
> >   xen/include/asm-x86/alternative.h |  78 ++++++++++++++
> >   5 files changed, 310 insertions(+)
> >   create mode 100644 xen/arch/x86/alternative.c
> >   create mode 100644 xen/include/asm-x86/alternative.h
> >
> > diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
> > index d502bdf..3734884 100644
> > --- a/xen/arch/x86/Makefile
> > +++ b/xen/arch/x86/Makefile
> > @@ -58,6 +58,7 @@ obj-y += crash.o
> >   obj-y += tboot.o
> >   obj-y += hpet.o
> >   obj-y += xstate.o
> > +obj-y += alternative.o
> >
> >   obj-$(crash_debug) += gdbstub.o
> >
> > diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
> > new file mode 100644
> > index 0000000..3dbc811
> > --- /dev/null
> > +++ b/xen/arch/x86/alternative.c
> > @@ -0,0 +1,213 @@
> >
> +/**************************************************************
> ****************
> > + * alternative.c
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301,
> USA.
> > + */
> > +
> > +#include <xen/types.h>
> > +#include <asm/processor.h>
> > +#include <asm/alternative.h>
> > +#include <xen/init.h>
> > +#include <asm/system.h>
> > +#include <asm/traps.h>
> > +#include <asm/nmi.h>
> > +
> > +#define MAX_PATCH_LEN (255-1)
> > +
> > +extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
> > +
> > +#ifdef K8_NOP1
> > +static const unsigned char k8nops[] __initconst = {
> > +    K8_NOP1,
> > +    K8_NOP2,
> > +    K8_NOP3,
> > +    K8_NOP4,
> > +    K8_NOP5,
> > +    K8_NOP6,
> > +    K8_NOP7,
> > +    K8_NOP8
> > +};
> > +static const unsigned char * const k8_nops[ASM_NOP_MAX+1] __initconst =
> {
> > +    NULL,
> > +    k8nops,
> > +    k8nops + 1,
> > +    k8nops + 1 + 2,
> > +    k8nops + 1 + 2 + 3,
> > +    k8nops + 1 + 2 + 3 + 4,
> > +    k8nops + 1 + 2 + 3 + 4 + 5,
> > +    k8nops + 1 + 2 + 3 + 4 + 5 + 6,
> > +    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7
> > +};
> > +#endif
> > +
> > +#ifdef P6_NOP1
> > +static const unsigned char p6nops[] __initconst = {
> > +    P6_NOP1,
> > +    P6_NOP2,
> > +    P6_NOP3,
> > +    P6_NOP4,
> > +    P6_NOP5,
> > +    P6_NOP6,
> > +    P6_NOP7,
> > +    P6_NOP8
> > +};
> > +static const unsigned char * const p6_nops[ASM_NOP_MAX+1] __initconst
> = {
> > +    NULL,
> > +    p6nops,
> > +    p6nops + 1,
> > +    p6nops + 1 + 2,
> > +    p6nops + 1 + 2 + 3,
> > +    p6nops + 1 + 2 + 3 + 4,
> > +    p6nops + 1 + 2 + 3 + 4 + 5,
> > +    p6nops + 1 + 2 + 3 + 4 + 5 + 6,
> > +    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7
> > +};
> > +#endif
> > +
> > +static const unsigned char * const *ideal_nops __initdata = k8_nops;
> > +
> > +static int __init mask_nmi_callback(struct cpu_user_regs *regs, int cpu)
> > +{
> > +    return 1;
> > +}
> > +
> > +static void __init arch_init_ideal_nops(void)
> > +{
> > +    /*
> > +     * Due to a decoder implementation quirk, some
> > +     * specific Intel CPUs actually perform better with
> > +     * the "k8_nops" than with the SDM-recommended NOPs.
> > +     */
> > +    if ( (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
> > +         !(boot_cpu_data.x86 == 6 &&
> > +           boot_cpu_data.x86_model >= 0x0f &&
> > +           boot_cpu_data.x86_model != 0x1c &&
> > +           boot_cpu_data.x86_model != 0x26 &&
> > +           boot_cpu_data.x86_model != 0x27 &&
> > +           boot_cpu_data.x86_model < 0x30) )
> > +        ideal_nops = p6_nops;
> > +}
> > +
> > +/* Use this to add nops to a buffer, then text_poke the whole buffer. */
> > +static void __init add_nops(void *insns, unsigned int len)
> > +{
> > +    while ( len > 0 )
> > +    {
> > +        unsigned int noplen = len;
> > +        if ( noplen > ASM_NOP_MAX )
> > +            noplen = ASM_NOP_MAX;
> > +        memcpy(insns, ideal_nops[noplen], noplen);
> > +        insns += noplen;
> > +        len -= noplen;
> > +    }
> > +}
> > +
> > +/*
> > + * text_poke_early - Update instructions on a live kernel at boot time
> > + * @addr: address to modify
> > + * @opcode: source of the copy
> > + * @len: length to copy
> > + *
> > + * When you use this code to patch more than one byte of an instruction
> > + * you need to make sure that other CPUs cannot execute this code in
> parallel.
> > + * Also no thread must be currently preempted in the middle of these
> > + * instructions. And on the local CPU you need to be protected against NMI or
> MCE
> > + * handlers seeing an inconsistent instruction while you patch.
> > + */
> > +static void *__init text_poke_early(void *addr, const void *opcode, size_t
> len)
> > +{
> > +    unsigned long flags;
> > +
> > +    local_irq_save(flags);
> > +    memcpy(addr, opcode, len);
> > +    sync_core();
> > +    local_irq_restore(flags);
> > +
> > +    return addr;
> > +}
> > +
> > +/*
> > + * Replace instructions with better alternatives for this CPU type.
> > + * This runs before SMP is initialized to avoid SMP problems with
> > + * self modifying code. This implies that asymmetric systems where
> > + * APs have less capabilities than the boot processor are not handled.
> > + * Tough. Make sure you disable such features by hand.
> > + */
> > +
> 
> Excess newline
> 
> > +static void __init apply_alternatives(struct alt_instr *start, struct 
> > alt_instr
> *end)
> > +{
> > +    struct alt_instr *a;
> > +    u8 *instr, *replacement;
> > +    u8 insnbuf[MAX_PATCH_LEN];
> > +
> > +    printk(KERN_INFO "alt table %p -> %p\n", start, end);
> > +
> > +    /*
> > +     * The scan order should be from start to end. A later scanned
> > +     * alternative code can overwrite a previous scanned alternative code.
> > +     * Some kernel functions (e.g. memcpy, memset, etc) use this order to
> > +     * patch code.
> > +     *
> > +     * So be careful if you want to change the scan order to any other
> > +     * order.
> > +     */
> > +    for ( a = start; a < end; a++ )
> > +    {
> > +        instr = (u8 *)&a->instr_offset + a->instr_offset;
> > +        replacement = (u8 *)&a->repl_offset + a->repl_offset;
> > +        BUG_ON(a->replacementlen > a->instrlen);
> > +        BUG_ON(a->instrlen > sizeof(insnbuf));
> > +        BUG_ON(a->cpuid >= NCAPINTS * 32);
> > +        if ( !boot_cpu_has(a->cpuid) )
> > +            continue;
> > +
> > +        memcpy(insnbuf, replacement, a->replacementlen);
> > +
> > +        /* 0xe8/0xe9 is a relative jump; fix the offset. */
> > +        if ( (*insnbuf & 0xfe) == 0xe8 && a->replacementlen == 5 )
> > +            *(s32 *)(insnbuf + 1) += replacement - instr;
> > +
> > +        add_nops(insnbuf + a->replacementlen,
> > +                 a->instrlen - a->replacementlen);
> > +        text_poke_early(instr, insnbuf, a->instrlen);
> > +    }
> > +}
> > +
> > +void __init alternative_instructions(void)
> > +{
> > +    nmi_callback_t saved_nmi_callback;
> > +
> > +    arch_init_ideal_nops();
> > +
> > +    /*
> > +     * The patching is not fully atomic, so try to avoid local 
> > interruptions
> > +     * that might execute the to be patched code.
> > +     * Other CPUs are not running.
> > +     */
> > +    saved_nmi_callback = set_nmi_callback(mask_nmi_callback);
> 
> Newline here
> 
> > +    /*
> > +     * Don't stop machine check exceptions while patching.
> > +     * MCEs only happen when something got corrupted and in this
> > +     * case we must do something about the corruption.
> > + * Ignoring it is worse than an unlikely patching race.
> > +     * Also machine checks tend to be broadcast and if one CPU
> > +     * goes into machine check the others follow quickly, so we don't
> > + * expect a machine check to cause undue problems during code
> > +     * patching.
> > +     */
> > +
> 
> but not here.
> 
> > +    apply_alternatives(__alt_instructions, __alt_instructions_end);
> 
> Possibly also here.
> 
> > +    set_nmi_callback(saved_nmi_callback);
> > +}
> > diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> > index 508649d..d16453a 100644
> > --- a/xen/arch/x86/setup.c
> > +++ b/xen/arch/x86/setup.c
> > @@ -48,6 +48,7 @@
> >   #include <asm/setup.h>
> >   #include <xen/cpu.h>
> >   #include <asm/nmi.h>
> > +#include <asm/alternative.h>
> >
> >   /* opt_nosmp: If true, secondary processors are ignored. */
> >   static bool_t __initdata opt_nosmp;
> > @@ -1288,6 +1289,8 @@ void __init noreturn __start_xen(unsigned long
> mbi_p)
> >       if ( cpu_has_fsgsbase )
> >           set_in_cr4(X86_CR4_FSGSBASE);
> >
> > +    alternative_instructions();
> > +
> 
> Given this ordering, it might be cleaner to have an
> ASSERT(!local_irq_enabled()) in the top of alternative_instructions(),
> and forgo the local_irq_save/restore() in text_poke_early().
> 
> If you can move this higher up before enabling MCEs in CR4, it might be
> slightly more resilient.
> 
> ~Andrew

MCE bit in CR4 is set in identify_cpu() --> mcheck_init() --> 
set_in_cr4(X86_CR4_MCE), but
apply_alternatives() needs boot_cpu_data.x86_capability to be ready, since it 
calls boot_cpu_has().
If we put alternative_instructions() before enabling MCEs in CR4, which place 
do you suggest? Thanks!

Thanks,
Feng

> 
> >       local_irq_enable();
> >
> >       pt_pci_init();
> > diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
> > index 17db361..d4b1f1a 100644
> > --- a/xen/arch/x86/xen.lds.S
> > +++ b/xen/arch/x86/xen.lds.S
> > @@ -105,6 +105,12 @@ SECTIONS
> >     .init.text : {
> >          _sinittext = .;
> >          *(.init.text)
> > +       /*
> > +        * Here are the replacement instructions. The linker sticks them
> > +        * as binary blobs. The .altinstructions has enough data to get
> > +        * the address and the length of them to patch the kernel safely.
> > +        */
> > +       *(.altinstr_replacement)
> >          _einittext = .;
> >     } :text
> >     .init.data : {
> > @@ -120,6 +126,15 @@ SECTIONS
> >          __trampoline_seg_start = .;
> >          *(.trampoline_seg)
> >          __trampoline_seg_stop = .;
> > +       /*
> > +        * struct alt_inst entries. From the header (alternative.h):
> > +        * "Alternative instructions for different CPU types or 
> > capabilities"
> > +        * Think locking instructions on spinlocks.
> > +        */
> > +       . = ALIGN(8);
> > +        __alt_instructions = .;
> > +        *(.altinstructions)
> > +        __alt_instructions_end = .;
> >
> >          . = ALIGN(8);
> >          __ctors_start = .;
> > diff --git a/xen/include/asm-x86/alternative.h
> b/xen/include/asm-x86/alternative.h
> > new file mode 100644
> > index 0000000..55a6604
> > --- /dev/null
> > +++ b/xen/include/asm-x86/alternative.h
> > @@ -0,0 +1,78 @@
> > +#ifndef __X86_ALTERNATIVE_H__
> > +#define __X86_ALTERNATIVE_H__
> > +
> > +#include <asm/nops.h>
> > +
> > +#ifdef __ASSEMBLY__
> > +.macro altinstruction_entry orig alt feature orig_len alt_len
> > +        .long \orig - .
> > +        .long \alt - .
> > +        .word \feature
> > +        .byte \orig_len
> > +        .byte \alt_len
> > +.endm
> > +#else
> > +#include <xen/types.h>
> > +
> > +struct alt_instr {
> > +    s32 instr_offset;       /* original instruction */
> > +    s32 repl_offset;        /* offset to replacement instruction */
> > +    u16 cpuid;              /* cpuid bit set for replacement */
> > +    u8  instrlen;           /* length of original instruction */
> > +    u8  replacementlen;     /* length of new instruction, <= instrlen */
> > +};
> > +
> > +extern void alternative_instructions(void);
> > +
> > +#define OLDINSTR(oldinstr)      "661:\n\t" oldinstr "\n662:\n"
> > +
> > +#define b_replacement(number)   "663"#number
> > +#define e_replacement(number)   "664"#number
> > +
> > +#define alt_slen "662b-661b"
> > +#define alt_rlen(number)
> e_replacement(number)"f-"b_replacement(number)"f"
> > +
> > +#define ALTINSTR_ENTRY(feature, number)
> \
> > +        " .long 661b - .\n"                             /* label
> */ \
> > +        " .long " b_replacement(number)"f - .\n"        /* new
> instruction */ \
> > +        " .word " STR(feature) "\n"                     /* feature bit
> */ \
> > +        " .byte " alt_slen "\n"                         /* source len
> */ \
> > +        " .byte " alt_rlen(number) "\n"                 /*
> replacement len */
> > +
> > +#define DISCARD_ENTRY(number)                           /* rlen
> <= slen */    \
> > +        " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
> > +
> > +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /*
> replacement */     \
> > +        b_replacement(number)":\n\t" newinstr "\n"
> e_replacement(number) ":\n\t"
> > +
> > +/* alternative assembly primitive: */
> > +#define ALTERNATIVE(oldinstr, newinstr, feature)
> \
> > +        OLDINSTR(oldinstr)
> \
> > +        ".pushsection .altinstructions,\"a\"\n"
> \
> > +        ALTINSTR_ENTRY(feature, 1)
> \
> > +        ".popsection\n"
> \
> > +        ".pushsection .discard,\"aw\",@progbits\n"
> \
> > +        DISCARD_ENTRY(1)
> \
> > +        ".popsection\n"
> \
> > +        ".pushsection .altinstr_replacement, \"ax\"\n"
> \
> > +        ALTINSTR_REPLACEMENT(newinstr, feature, 1)
> \
> > +        ".popsection"
> > +
> > +/*
> > + * Alternative instructions for different CPU types or capabilities.
> > + *
> > + * This allows to use optimized instructions even on generic binary
> > + * kernels.
> > + *
> > + * length of oldinstr must be longer or equal the length of newinstr
> > + * It can be padded with nops as needed.
> > + *
> > + * For non barrier like inlines please define new variants
> > + * without volatile and memory clobber.
> > + */
> > +#define alternative(oldinstr, newinstr, feature)
> \
> > +        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : :
> "memory")
> > +
> > +#endif  /*  __ASSEMBLY__  */
> > +
> > +#endif /* __X86_ALTERNATIVE_H__ */


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.