
Re: [Xen-devel] [PATCH v3 07/23] xsplice: Implement support for applying/reverting/replacing patches. (v5)



On 12/02/16 18:05, Konrad Rzeszutek Wilk wrote:
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index 9d43f7b..b5995b9 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -36,6 +36,7 @@
>  #include <xen/cpu.h>
>  #include <xen/wait.h>
>  #include <xen/guest_access.h>
> +#include <xen/xsplice.h>
>  #include <public/sysctl.h>
>  #include <public/hvm/hvm_vcpu.h>
>  #include <asm/regs.h>
> @@ -121,6 +122,7 @@ static void idle_loop(void)
>          (*pm_idle)();
>          do_tasklet();
>          do_softirq();
> +        do_xsplice(); /* Must be last. */

The name "do_xsplice()" is slightly misleading (although it is in equal
company here).  check_for_xsplice_work() would be more accurate.

> diff --git a/xen/arch/x86/xsplice.c b/xen/arch/x86/xsplice.c
> index 814dd52..ae35e91 100644
> --- a/xen/arch/x86/xsplice.c
> +++ b/xen/arch/x86/xsplice.c
> @@ -10,6 +10,25 @@
>                              __func__,__LINE__, x); return x; }
>  #endif
>  
> +#define PATCH_INSN_SIZE 5

Somewhere you should have a BUILD_BUG_ON() confirming that
PATCH_INSN_SIZE fits within the undo array.
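Something like this would do (assuming undo[] remains a fixed-size byte
array inside struct xsplice_patch_func):

    BUILD_BUG_ON(PATCH_INSN_SIZE >
                 sizeof(((struct xsplice_patch_func *)0)->undo));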

Having said that, I think all of xsplice_patch_func should be
arch-specific rather than generic.

> +
> +void xsplice_apply_jmp(struct xsplice_patch_func *func)
> +{
> +    uint32_t val;
> +    uint8_t *old_ptr;
> +
> +    old_ptr = (uint8_t *)func->old_addr;
> +    memcpy(func->undo, old_ptr, PATCH_INSN_SIZE);

At least a newline here please.

> +    *old_ptr++ = 0xe9; /* Relative jump */
> +    val = func->new_addr - func->old_addr - PATCH_INSN_SIZE;

E9 takes a rel32 parameter, which is signed.

I think you need to cast explicitly to intptr_t and use signed
arithmetic throughout this calculation to correctly calculate a
backwards jump.
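i.e. roughly (untested, and assuming old_addr/new_addr stay as 64bit
fields):

    int64_t disp = (int64_t)func->new_addr - (int64_t)func->old_addr -
                   PATCH_INSN_SIZE;
    int32_t val;

    /* The displacement must fit in E9's signed 32bit rel32 field. */
    ASSERT(disp == (int32_t)disp);
    val = disp;
    memcpy(old_ptr, &val, sizeof(val));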

I think there should also be some sanity checks that both old_addr and
new_addr are in the Xen 1G virtual region.

> +    memcpy(old_ptr, &val, sizeof val);
> +}
> +
> +void xsplice_revert_jmp(struct xsplice_patch_func *func)
> +{
> +    memcpy((void *)func->old_addr, func->undo, PATCH_INSN_SIZE);

_p() is common shorthand in Xen for a cast to (void *)
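i.e.:

    memcpy(_p(func->old_addr), func->undo, PATCH_INSN_SIZE);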

> +}
> +
>  int xsplice_verify_elf(struct xsplice_elf *elf, uint8_t *data)
>  {
>  
> diff --git a/xen/common/xsplice.c b/xen/common/xsplice.c
> index fbd6129..b854c0a 100644
> --- a/xen/common/xsplice.c
> +++ b/xen/common/xsplice.c
> @@ -3,6 +3,7 @@
>   *
>   */
>  
> +#include <xen/cpu.h>
>  #include <xen/guest_access.h>
>  #include <xen/keyhandler.h>
>  #include <xen/lib.h>
> @@ -10,16 +11,25 @@
>  #include <xen/mm.h>
>  #include <xen/sched.h>
>  #include <xen/smp.h>
> +#include <xen/softirq.h>
>  #include <xen/spinlock.h>
> +#include <xen/wait.h>
>  #include <xen/xsplice_elf.h>
>  #include <xen/xsplice.h>
>  
>  #include <asm/event.h>
> +#include <asm/nmi.h>
>  #include <public/sysctl.h>
>  
> +/*
> + * Protects against payload_list operations and also allows only one
> + * caller in schedule_work.
> + */
>  static DEFINE_SPINLOCK(payload_lock);
>  static LIST_HEAD(payload_list);
>  
> +static LIST_HEAD(applied_list);
> +
>  static unsigned int payload_cnt;
>  static unsigned int payload_version = 1;
>  
> @@ -29,6 +39,9 @@ struct payload {
>      struct list_head list;               /* Linked to 'payload_list'. */
>      void *payload_address;               /* Virtual address mapped. */
>      size_t payload_pages;                /* Nr of the pages. */
> +    struct list_head applied_list;       /* Linked to 'applied_list'. */
> +    struct xsplice_patch_func *funcs;    /* The array of functions to patch. */
> +    unsigned int nfuncs;                 /* Nr of functions to patch. */
>  
>      char name[XEN_XSPLICE_NAME_SIZE + 1];/* Name of it. */
>  };
> @@ -36,6 +49,24 @@ struct payload {
> static int load_payload_data(struct payload *payload, uint8_t *raw, ssize_t len);
>  static void free_payload_data(struct payload *payload);
>  
> +/* Defines an outstanding patching action. */
> +struct xsplice_work
> +{
> +    atomic_t semaphore;          /* Used for rendezvous. First to grab it will
> +                                    do the patching. */
> +    atomic_t irq_semaphore;      /* Used to signal all IRQs disabled. */
> +    uint32_t timeout;                    /* Timeout to do the operation. */
> +    struct payload *data;        /* The payload on which to act. */
> +    volatile bool_t do_work;     /* Signals work to do. */
> +    volatile bool_t ready;       /* Signals all CPUs synchronized. */
> +    uint32_t cmd;                /* Action request: XSPLICE_ACTION_* */
> +};
> +
> +/* There can be only one outstanding patching action. */
> +static struct xsplice_work xsplice_work;

This is a scalability issue, specifically that every cpu on the
return-to-guest path polls the bytes making up do_work.  See below for
my suggestion to fix this.

> +
> +static int schedule_work(struct payload *data, uint32_t cmd, uint32_t timeout);
> +
>  static const char *state2str(int32_t state)
>  {
>  #define STATE(x) [XSPLICE_STATE_##x] = #x
> @@ -61,14 +92,23 @@ static const char *state2str(int32_t state)
>  static void xsplice_printall(unsigned char key)
>  {
>      struct payload *data;
> +    unsigned int i;
>  
>      spin_lock(&payload_lock);
>  
>      list_for_each_entry ( data, &payload_list, list )
> -        printk(" name=%s state=%s(%d) %p using %zu pages.\n", data->name,
> +    {
> +        printk(" name=%s state=%s(%d) %p using %zu pages:\n", data->name,
>                 state2str(data->state), data->state, data->payload_address,
>                 data->payload_pages);
>  
> +        for ( i = 0; i < data->nfuncs; i++ )
> +        {
> +            struct xsplice_patch_func *f = &(data->funcs[i]);
> +            printk("    %s patch 0x%"PRIx64"(%u) with 0x%"PRIx64"(%u)\n",
> +                   f->name, f->old_addr, f->old_size, f->new_addr, f->new_size);
> +        }

For a large patch, this could be thousands of entries.  You need to
periodically process pending softirqs to avoid a watchdog timeout.
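e.g. something along these lines:

    for ( i = 0; i < data->nfuncs; i++ )
    {
        struct xsplice_patch_func *f = &(data->funcs[i]);

        printk("    %s patch 0x%"PRIx64"(%u) with 0x%"PRIx64"(%u)\n",
               f->name, f->old_addr, f->old_size, f->new_addr, f->new_size);

        /* Keep the watchdog happy when dumping a huge payload. */
        if ( i && !(i % 64) )
            process_pending_softirqs();
    }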

> static int load_payload_data(struct payload *payload, uint8_t *raw, ssize_t len)
>  {
>      struct xsplice_elf elf;
> @@ -605,7 +695,14 @@ static int load_payload_data(struct payload *payload, uint8_t *raw, ssize_t len)
>      if ( rc )
>          goto err_payload;
>  
> -    /* Free our temporary data structure. */
> +    rc = check_special_sections(payload, &elf);
> +    if ( rc )
> +        goto err_payload;
> +
> +    rc = find_special_sections(payload, &elf);
> +    if ( rc )
> +        goto err_payload;
> +
>      xsplice_elf_free(&elf);
>      return 0;
>  
> @@ -617,6 +714,253 @@ static int load_payload_data(struct payload *payload, uint8_t *raw, ssize_t len)
>      return rc;
>  }
>  
> +
> +/*
> + * The following functions get the CPUs into an appropriate state and
> + * apply (or revert) each of the module's functions.
> + */
> +
> +/*
> + * This function is executed having all other CPUs with no stack (we may
> + * have cpu_idle on it) and IRQs disabled. We guard against NMI by temporarily
> + * installing our NOP NMI handler.
> + */
> +static int apply_payload(struct payload *data)
> +{
> +    unsigned int i;
> +
> +    printk(XENLOG_DEBUG "%s: Applying %u functions.\n", data->name,
> +           data->nfuncs);
> +
> +    for ( i = 0; i < data->nfuncs; i++ )
> +        xsplice_apply_jmp(data->funcs + i);

In cases such as this, it is better to use data->funcs[i], as the
compiler can more easily perform pointer alias analysis.
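i.e.:

    xsplice_apply_jmp(&data->funcs[i]);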

> +
> +    list_add_tail(&data->applied_list, &applied_list);
> +
> +    return 0;
> +}
> +
> +/*
> + * This function is executed having all other CPUs with no stack (we may
> + * have cpu_idle on it) and IRQs disabled.
> + */
> +static int revert_payload(struct payload *data)
> +{
> +    unsigned int i;
> +
> +    printk(XENLOG_DEBUG "%s: Reverting.\n", data->name);
> +
> +    for ( i = 0; i < data->nfuncs; i++ )
> +        xsplice_revert_jmp(data->funcs + i);
> +
> +    list_del(&data->applied_list);
> +
> +    return 0;
> +}
> +
> +/* Must be holding the payload_lock. */
> +static int schedule_work(struct payload *data, uint32_t cmd, uint32_t timeout)
> +{
> +    /* Fail if an operation is already scheduled. */
> +    if ( xsplice_work.do_work )
> +        return -EBUSY;
> +
> +    xsplice_work.cmd = cmd;
> +    xsplice_work.data = data;
> +    xsplice_work.timeout = timeout ? timeout : MILLISECS(30);

Can shorten to "timeout ?: MILLISECS(30);"

> +
> +    printk(XENLOG_DEBUG "%s: timeout is %"PRI_stime"ms\n", data->name,
> +           xsplice_work.timeout / MILLISECS(1));
> +
> +    atomic_set(&xsplice_work.semaphore, -1);
> +    atomic_set(&xsplice_work.irq_semaphore, -1);
> +
> +    xsplice_work.ready = 0;
> +    smp_wmb();
> +    xsplice_work.do_work = 1;
> +    smp_wmb();
> +
> +    return 0;
> +}
> +
> +/*
> + * Note that because of this NOP code the do_nmi is not safely patchable.
> + * Also if we do receive 'real' NMIs we have lost them.

The MCE path needs consideration as well.  Unlike the NMI path,
however, that one cannot be ignored.

In both cases, it might be best to see about raising a tasklet or
softirq to pick up some deferred work.
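As an untested sketch of the NMI side (names invented):

    static DEFINE_PER_CPU(bool_t, xsplice_nmi_pending);

    static int mask_nmi_callback(const struct cpu_user_regs *regs, int cpu)
    {
        /* Don't lose the event - note it for handling after patching. */
        this_cpu(xsplice_nmi_pending) = 1;
        return 1;
    }

after which the initiator, having restored the real callback, can check
the per-cpu flags and arrange whatever deferred work is appropriate.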

> + */
> +static int mask_nmi_callback(const struct cpu_user_regs *regs, int cpu)
> +{
> +    return 1;
> +}
> +
> +static void reschedule_fn(void *unused)
> +{
> +    smp_mb(); /* Synchronize with setting do_work */
> +    raise_softirq(SCHEDULE_SOFTIRQ);

As you have to IPI each processor to raise the schedule softirq, you can
set a per-cpu "xsplice enter rendezvous" variable in the same handler.
This avoids having every CPU's return-to-guest path poll a single shared
byte.
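i.e. something like (names invented, sketch only):

    static DEFINE_PER_CPU(bool_t, xsplice_enter_rendezvous);

    static void reschedule_fn(void *unused)
    {
        this_cpu(xsplice_enter_rendezvous) = 1;
        smp_mb(); /* Synchronize with setting do_work */
        raise_softirq(SCHEDULE_SOFTIRQ);
    }

with the fast path in do_xsplice() then testing (and clearing) only its
own per-cpu flag rather than the shared do_work byte.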

> +}
> +
> +static int xsplice_do_wait(atomic_t *counter, s_time_t timeout,
> +                           unsigned int total_cpus, const char *s)
> +{
> +    int rc = 0;
> +
> +    while ( atomic_read(counter) != total_cpus && NOW() < timeout )
> +        cpu_relax();
> +
> +    /* Log & abort. */
> +    if ( atomic_read(counter) != total_cpus )
> +    {
> +        printk(XENLOG_DEBUG "%s: %s %u/%u\n", xsplice_work.data->name,
> +               s, atomic_read(counter), total_cpus);
> +        rc = -EBUSY;
> +        xsplice_work.data->rc = rc;
> +        xsplice_work.do_work = 0;
> +        smp_wmb();
> +        return rc;
> +    }
> +    return rc;
> +}
> +
> +static void xsplice_do_single(unsigned int total_cpus)
> +{
> +    nmi_callback_t saved_nmi_callback;
> +    struct payload *data, *tmp;
> +    s_time_t timeout;
> +    int rc;
> +
> +    data = xsplice_work.data;
> +    timeout = xsplice_work.timeout + NOW();
> +    if ( xsplice_do_wait(&xsplice_work.semaphore, timeout, total_cpus,
> +                         "Timed out on CPU semaphore") )
> +        return;
> +
> +    /* "Mask" NMIs. */
> +    saved_nmi_callback = set_nmi_callback(mask_nmi_callback);
> +
> +    /* All CPUs are waiting, now signal to disable IRQs. */
> +    xsplice_work.ready = 1;
> +    smp_wmb();
> +
> +    atomic_inc(&xsplice_work.irq_semaphore);
> +    if ( xsplice_do_wait(&xsplice_work.irq_semaphore, timeout, total_cpus,
> +                         "Timed out on IRQ semaphore.") )

This path "leaks" the NMI mask.  It would be better use "goto out;"
handling in the function to make it clearer that the error paths are
taken care of.

> +        return;
> +
> +    local_irq_disable();

local_irq_save().  Easier to restore on an error path.
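Combining that with the "goto out" point above, the shape would be
roughly (sketch only):

    unsigned long flags;
    nmi_callback_t saved_nmi_callback;

    saved_nmi_callback = set_nmi_callback(mask_nmi_callback);

    /* ... set ready and bump irq_semaphore as before ... */

    if ( xsplice_do_wait(&xsplice_work.irq_semaphore, timeout, total_cpus,
                         "Timed out on IRQ semaphore.") )
        goto out;

    local_irq_save(flags);

    /* ... the existing switch ( xsplice_work.cmd ) ... */

    local_irq_restore(flags);

 out:
    set_nmi_callback(saved_nmi_callback);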

> +    /* Now this function should be the only one on any stack.
> +     * No need to lock the payload list or applied list. */
> +    switch ( xsplice_work.cmd )
> +    {
> +    case XSPLICE_ACTION_APPLY:
> +        rc = apply_payload(data);
> +        if ( rc == 0 )
> +            data->state = XSPLICE_STATE_APPLIED;
> +        break;
> +    case XSPLICE_ACTION_REVERT:
> +        rc = revert_payload(data);
> +        if ( rc == 0 )
> +            data->state = XSPLICE_STATE_CHECKED;
> +        break;
> +    case XSPLICE_ACTION_REPLACE:
> +        list_for_each_entry_safe_reverse ( data, tmp, &applied_list, list )
> +        {
> +            data->rc = revert_payload(data);
> +            if ( data->rc == 0 )
> +                data->state = XSPLICE_STATE_CHECKED;
> +            else
> +            {
> +                rc = -EINVAL;
> +                break;
> +            }
> +        }
> +        if ( rc != -EINVAL )
> +        {
> +            rc = apply_payload(xsplice_work.data);
> +            if ( rc == 0 )
> +                xsplice_work.data->state = XSPLICE_STATE_APPLIED;
> +        }
> +        break;
> +    default:
> +        rc = -EINVAL;
> +        break;
> +    }

Once code modification is complete, you must execute a serialising
instruction, such as cpuid, on all CPUs to flush the pipelines.  (Refer
to Intel SDM Vol 3, section 8.1.4 "Handling Self- and Cross-Modifying
Code".)
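A minimal sketch (name invented), to be executed by every CPU before it
resumes normal work:

    static void xsplice_serialise_cpu(void)
    {
        uint32_t eax = 0, ebx, ecx, edx;

        /* CPUID is architecturally serialising; it discards any stale
         * prefetched/decoded instructions on this CPU. */
        asm volatile ( "cpuid"
                       : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                       :: "memory" );
    }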

> +
> +    xsplice_work.data->rc = rc;
> +
> +    local_irq_enable();
> +    set_nmi_callback(saved_nmi_callback);
> +
> +    xsplice_work.do_work = 0;
> +    smp_wmb(); /* Synchronize with waiting CPUs. */
> +}
> +
> +/*
> + * The main function which manages the work of quiescing the system and
> + * patching code.
> + */
> +void do_xsplice(void)
> +{
> +    struct payload *p = xsplice_work.data;
> +    unsigned int cpu = smp_processor_id();
> +
> +    /* Fast path: no work to do. */
> +    if ( likely(!xsplice_work.do_work) )
> +        return;
> +    ASSERT(local_irq_is_enabled());
> +
> +    /* Set at -1, so will go up to num_online_cpus - 1 */
> +    if ( atomic_inc_and_test(&xsplice_work.semaphore) )
> +    {
> +        unsigned int total_cpus;
> +
> +        if ( !get_cpu_maps() )
> +        {
> +            printk(XENLOG_DEBUG "%s: CPU%u - unable to get cpu_maps lock.\n",
> +                   p->name, cpu);
> +            xsplice_work.data->rc = -EBUSY;
> +            xsplice_work.do_work = 0;
> +            return;

This error path leaves a ref in the semaphore.
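i.e. one way to handle it:

            xsplice_work.data->rc = -EBUSY;
            xsplice_work.do_work = 0;
            /* Drop our reference again so the count stays balanced. */
            atomic_dec(&xsplice_work.semaphore);
            return;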

> +        }
> +
> +        barrier(); /* MUST do it after get_cpu_maps. */
> +        total_cpus = num_online_cpus() - 1;
> +
> +        if ( total_cpus )
> +        {
> +            printk(XENLOG_DEBUG "%s: CPU%u - IPIing the %u CPUs.\n", p->name,
> +                   cpu, total_cpus);
> +            smp_call_function(reschedule_fn, NULL, 0);
> +        }
> +        (void)xsplice_do_single(total_cpus);
> +
> +        ASSERT(local_irq_is_enabled());
> +
> +        put_cpu_maps();
> +
> +        printk(XENLOG_DEBUG "%s finished with rc=%d\n", p->name, p->rc);
> +    }
> +    else
> +    {
> +        /* Wait for all CPUs to rendezvous. */
> +        while ( xsplice_work.do_work && !xsplice_work.ready )
> +        {
> +            cpu_relax();
> +            smp_rmb();
> +        }
> +

What happens here if the rendezvous initiator times out?  Looks like we
will spin forever waiting for do_work which will never drop back to 0.

> +        /* Disable IRQs and signal. */
> +        local_irq_disable();
> +        atomic_inc(&xsplice_work.irq_semaphore);
> +
> +        /* Wait for patching to complete. */
> +        while ( xsplice_work.do_work )
> +        {
> +            cpu_relax();
> +            smp_rmb();
> +        }
> +        local_irq_enable();

Splitting the modification of do_work and ready across multiple
functions makes it particularly hard to reason about the correctness of
the rendezvous.  It would be better to have a xsplice_rendezvous()
function whose purpose was to negotiate the rendezvous only, using local
static state.  The action can then be just the switch() from
xsplice_do_single().
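i.e. a shape along the lines of (sketch, names invented):

    /*
     * Returns true on exactly one CPU - the one which should perform the
     * patching - once all CPUs have rendezvoused with IRQs disabled, and
     * false everywhere else once the action has completed or been
     * abandoned.
     */
    static bool_t xsplice_rendezvous(void);

    void do_xsplice(void)
    {
        if ( likely(!xsplice_work.do_work) )
            return;

        if ( xsplice_rendezvous() )
            xsplice_do_action(); /* Just the switch ( xsplice_work.cmd ). */
    }

with all manipulation of do_work/ready kept inside xsplice_rendezvous().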

> +    }
> +}
> +
> diff --git a/xen/include/asm-arm/nmi.h b/xen/include/asm-arm/nmi.h
> index a60587e..82aff35 100644
> --- a/xen/include/asm-arm/nmi.h
> +++ b/xen/include/asm-arm/nmi.h
> @@ -4,6 +4,19 @@
>  #define register_guest_nmi_callback(a)  (-ENOSYS)
>  #define unregister_guest_nmi_callback() (-ENOSYS)
>  
> +typedef int (*nmi_callback_t)(const struct cpu_user_regs *regs, int cpu);
> +
> +/**
> + * set_nmi_callback
> + *
> + * Set a handler for an NMI. Only one handler may be
> + * set. Return the old nmi callback handler.
> + */
> +static inline nmi_callback_t set_nmi_callback(nmi_callback_t callback)
> +{
> +    return NULL;
> +}
> +

This addition suggests that there should probably be an
arch_xsplice_prepare_rendezvous() and arch_xsplice_finish_rendezvous().
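i.e. something like (prototypes only; x86 would move the NMI masking in
there, while the ARM implementations can start out empty):

    void arch_xsplice_prepare_rendezvous(void); /* x86: mask NMIs.    */
    void arch_xsplice_finish_rendezvous(void);  /* x86: restore them. */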

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

