 
	
| [Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH RFC 8/9] xen: Generic instruction re-execution mechanism for execute faults
 The Xen emulator is incapable of handling some instructions, which
leads to the injection of an Invalid Opcode exception (#UD) inside
the guest once an unsupported instruction is encountered.
A new mechanism has been added which is able to generically re-execute
instructions, by temporarily granting permissions inside the EPT and
re-executing the instruction with all other vcpus paused and with the
monitor trap flag set. The mechanism is re-entrant, meaning that it is
capable of handling different violations caused by the same instruction.
Usually, a security appliance will decide when and what instructions
must be re-executed this way (instructions that lie in non-executable
pages and instructions that cause the setting of Accessed and/or Dirty
flags inside page tables are two examples).
Signed-off-by: Andrei Lutas <vlutas@xxxxxxxxxxxxxxx>
---
 xen/arch/x86/hvm/vmx/vmx.c |   51 ++++++++++++
 xen/arch/x86/mm/p2m.c      |  188 +++++++++++++++++++++++++++++++++++++++++++-
 xen/common/domain.c        |    6 ++
 xen/include/xen/sched.h    |   17 ++++
 4 files changed, 260 insertions(+), 2 deletions(-)
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 4a9a7c8..4976215 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2568,12 +2568,60 @@ void vmx_handle_EOI_induced_exit(struct vlapic *vlapic, 
int vector)
     vlapic_handle_EOI_induced_exit(vlapic, vector);
 }
 
+/*
+ * Finish an instruction re-execution that was started for @v: put back the
+ * EPT access rights saved in v->rexec_context[], reset the nesting level
+ * and unpause the other vCPUs of the domain.
+ *
+ * This is called from the MONITOR_TRAP_FLAG VM-exit path after the trap
+ * flag has already been cleared, so it is the only point at which the
+ * paused vCPUs get released. For that reason a failure to restore one GPA
+ * does not abort the teardown: the remaining GPAs are still restored and
+ * the vCPUs are still resumed; the failure is only reported to the caller.
+ *
+ * Returns 0 on success (or if no re-execution was pending), -1 if at least
+ * one p2m_set_mem_access() call failed.
+ */
+static int vmx_stop_reexecute_instruction(struct vcpu *v)
+{
+    int ret = 0, i;
+    struct vcpu *a;
+
+    if ( 0 == v->rexec_level )
+        return 0;
+
+    /* Step 1: Restore original EPT access rights for each GPA, newest
+     * entry first, so the oldest saved rights are the ones left in place. */
+    for ( i = v->rexec_level - 1; i >= 0; i-- )
+    {
+        if ( 0 != p2m_set_mem_access(v->domain,
+                                     v->rexec_context[i].gpa >> PAGE_SHIFT,
+                                     1, 0, 0xff,
+                                     v->rexec_context[i].old_access) )
+            ret = -1; /* Remember the failure, but keep tearing down. */
+
+        v->rexec_context[i].gpa = 0;
+    }
+
+    spin_lock(&v->domain->rexec_lock);
+
+    /* Step 2: Reset the nesting level to zero. */
+    v->rexec_level = 0;
+
+    /* Step 3: Resume all other VCPUs. */
+    for_each_vcpu ( v->domain, a )
+    {
+        if ( a == v )
+            continue;
+
+        /* Unpause the VCPU. */
+        vcpu_unpause(a);
+    }
+
+    /* Step 4: Remove the MONITOR trap flag.
+     * - this is already done when handling the exit. */
+
+    spin_unlock(&v->domain->rexec_lock);
+
+    return ret;
+}
+
 void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
     unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0;
     unsigned int vector = 0;
     struct vcpu *v = current;
 
+    v->in_host = 1;
+
     __vmread(GUEST_RIP,    &regs->rip);
     __vmread(GUEST_RSP,    &regs->rsp);
     __vmread(GUEST_RFLAGS, &regs->rflags);
@@ -3074,6 +3122,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     case EXIT_REASON_MONITOR_TRAP_FLAG:
         v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
         vmx_update_cpu_exec_control(v);
+        vmx_stop_reexecute_instruction(v);
         if ( v->arch.hvm_vcpu.single_step ) {
           hvm_memory_event_single_step(regs->eip);
           if ( v->domain->debugger_attached )
@@ -3191,6 +3240,8 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
     check_pf_injection();
 
  out:
+    curr->in_host = 0;
+
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 
     __vmwrite(GUEST_RIP,    regs->rip);
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 4dd3f1b..ff67b09 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -34,6 +34,7 @@
 #include <public/mem_event.h>
 #include <asm/mem_sharing.h>
 #include <xen/event.h>
+#include <xen/hypercall.h>
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/svm/amd-iommu-proto.h>
 #include <xsm/xsm.h>
@@ -1394,6 +1395,181 @@ static void p2m_set_ad_bits(struct vcpu *v, struct 
p2m_domain *p2m,
     v->sse_pg_dirty.gla = ga;
 }
 
+/*
+ * Prepare vCPU @v to re-execute, natively, the instruction that faulted on
+ * guest physical address @gpa.
+ *
+ * On the first (level 0) request every other vCPU of the domain is paused
+ * and the monitor trap flag is armed, so that a VM exit is generated once
+ * the instruction has been re-executed. The current EPT access rights of
+ * the page holding @gpa are saved in v->rexec_context[] and replaced with
+ * @required_access, merged with whatever was already granted for the same
+ * GPA at a lower nesting level. Nested requests push one more context
+ * entry, up to REEXECUTION_MAX_DEPTH.
+ *
+ * NOTE: Some required_accesses may be invalid. For example, one cannot
+ * grant only write access on a given page; read/write access must be
+ * granted instead. These inconsistencies are NOT checked here. The caller
+ * must ensure that "required_access" is an allowed combination.
+ *
+ * NOTE(review): the wait for the other vCPUs to reach VMX root spins on
+ * a->in_host while holding rexec_lock - confirm this cannot stall.
+ *
+ * Returns 0 on success, and also when another vCPU already has a
+ * re-execution in progress (nothing is changed in that case). Returns -1
+ * if the nesting limit is reached or a p2m access call fails; on a p2m
+ * failure the nesting level is rolled back and, at level 0, the vCPUs
+ * paused here are resumed.
+ */
+static int vmx_start_reexecute_instruction(struct vcpu *v,
+                                           unsigned long gpa,
+                                           xenmem_access_t required_access)
+{
+    int i, r = 0, w = 0, x = 0, level;
+    xenmem_access_t old_access, new_access;
+    struct vcpu *a;
+
+    spin_lock(&v->domain->rexec_lock);
+
+    level = v->rexec_level;
+
+    /* Step 1: Make sure someone else didn't get to start an
+     * instruction re-execution */
+    for_each_vcpu ( v->domain, a )
+    {
+        if ( a == v )
+            continue;
+
+        /* Check if "a" started an instruction re-execution. If so,
+         * return success, as we'll re-execute our instruction later. */
+        if ( 0 != a->rexec_level )
+        {
+            /* We should be paused. */
+            spin_unlock(&v->domain->rexec_lock);
+            return 0;
+        }
+    }
+
+    /* Step 2: Make sure we're not exceeding the max re-execution depth. */
+    if ( level >= REEXECUTION_MAX_DEPTH )
+    {
+        spin_unlock(&v->domain->rexec_lock);
+        return -1;
+    }
+
+    /* Step 3: Pause all the VCPUs, except self. Note that we have to do
+     * this only if we're at nesting level 0; if we're at a higher level
+     * of nested re-exec, the vcpus are already paused. */
+    if ( 0 == level )
+    {
+        for_each_vcpu ( v->domain, a )
+        {
+            if ( a == v )
+                continue;
+
+            /* Pause, NO SYNC! We're gonna do our own syncing. */
+            vcpu_pause_nosync(a);
+        }
+
+        /* Step 4: Wait for all the paused VCPUs to actually leave the VMX
+         * non-root realm and enter VMX root. */
+        for_each_vcpu ( v->domain, a )
+        {
+            if ( a == v )
+                continue;
+
+            while ( !a->in_host )
+                cpu_relax();
+        }
+    }
+
+    /* Update the re-execution nesting level. */
+    v->rexec_level++;
+
+    spin_unlock(&v->domain->rexec_lock);
+
+    /* Step 5: Check if this is a "double-fault" on the exact same GPA, in
+     * which case we promote the rights of this particular GPA: start from
+     * the rights already granted for it at lower nesting levels. */
+    for ( i = 0; i < level; i++ )
+    {
+        if ( v->rexec_context[i].gpa != gpa )
+            continue;
+
+        /* This GPA is already in the queue. */
+        switch ( v->rexec_context[i].cur_access )
+        {
+        case XENMEM_access_r: r = 1; break;
+        case XENMEM_access_w: w = 1; break;
+        case XENMEM_access_x: x = 1; break;
+        case XENMEM_access_rx: r = x = 1; break;
+        case XENMEM_access_wx: w = x = 1; break;
+        case XENMEM_access_rw: r = w = 1; break;
+        case XENMEM_access_rwx: r = w = x = 1; break;
+        default: break; /* We don't care about any other case. */
+        }
+    }
+
+    /* Get the current EPT access rights. They will be restored when we're
+     * done. Note that the restoration is done in reverse-order, in order to
+     * ensure that the original access rights are restored correctly.
+     * Otherwise, we may restore whatever access rights were modified by
+     * another re-execution request, and that would be bad. */
+    if ( 0 != p2m_get_mem_access(v->domain, gpa >> PAGE_SHIFT, &old_access) )
+        goto undo;
+
+    v->rexec_context[level].gpa = gpa;
+    v->rexec_context[level].old_access = old_access;
+
+    /* Step 6: Add the required access, so we can re-execute the
+     * instruction. */
+    switch ( required_access )
+    {
+    case XENMEM_access_r: r = 1; break;
+    case XENMEM_access_w: w = 1; break;
+    case XENMEM_access_x: x = 1; break;
+    case XENMEM_access_rx: r = x = 1; break;
+    case XENMEM_access_wx: w = x = 1; break;
+    case XENMEM_access_rw: r = w = 1; break;
+    case XENMEM_access_rwx: r = w = x = 1; break;
+    default: break; /* We don't care about any other case. */
+    }
+
+    /* Now transform our RWX values in a XENMEM_access_* constant. */
+    if ( 0 == r && 0 == w && 0 == x )
+        new_access = XENMEM_access_n;
+    else if ( 0 == r && 0 == w && 1 == x )
+        new_access = XENMEM_access_x;
+    else if ( 0 == r && 1 == w && 0 == x )
+        new_access = XENMEM_access_w;
+    else if ( 0 == r && 1 == w && 1 == x )
+        new_access = XENMEM_access_wx;
+    else if ( 1 == r && 0 == w && 0 == x )
+        new_access = XENMEM_access_r;
+    else if ( 1 == r && 0 == w && 1 == x )
+        new_access = XENMEM_access_rx;
+    else if ( 1 == r && 1 == w && 0 == x )
+        new_access = XENMEM_access_rw;
+    else if ( 1 == r && 1 == w && 1 == x )
+        new_access = XENMEM_access_rwx;
+    else
+        new_access = required_access; /* Should never get here. */
+
+    /* And save the current access rights. */
+    v->rexec_context[level].cur_access = new_access;
+
+    /* Apply the changes inside the EPT. */
+    if ( 0 != p2m_set_mem_access(v->domain, gpa >> PAGE_SHIFT,
+                                 1, 0, 0xff, new_access) )
+        goto undo;
+
+    /* Step 7: Reconfigure the VMCS, so it suits our needs. We want a
+     * VM-exit to be generated after the instruction has been
+     * successfully re-executed. */
+    if ( 0 == level )
+    {
+        v->arch.hvm_vmx.exec_control |= CPU_BASED_MONITOR_TRAP_FLAG;
+        vmx_update_cpu_exec_control(v);
+    }
+
+    /* Step 8: We should be done! */
+
+    return 0;
+
+undo:
+    /* Roll back what was done under the lock above. At level 0 the monitor
+     * trap flag has not been armed yet, so nothing else would ever resume
+     * the vCPUs we paused; at higher levels the pending level-0
+     * re-execution still owns them and will release them when it ends. */
+    spin_lock(&v->domain->rexec_lock);
+
+    v->rexec_level = level;
+
+    if ( 0 == level )
+    {
+        for_each_vcpu ( v->domain, a )
+        {
+            if ( a == v )
+                continue;
+
+            vcpu_unpause(a);
+        }
+    }
+
+    spin_unlock(&v->domain->rexec_lock);
+
+    return -1;
+}
+
 bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, 
                           bool_t access_r, bool_t access_w, bool_t access_x,
                           mem_event_request_t **req_ptr)
@@ -1472,7 +1648,10 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t 
gla_valid, unsigned long gla,
             v->arch.mem_event.gpa = 0;
             v->arch.mem_event.eip = 0;
 
-            p2m_set_ad_bits(v, p2m, gla);
+            if ( 0 == gpa )
+                p2m_set_ad_bits(v, p2m, gla);
+            else
+                vmx_start_reexecute_instruction(v, gpa, XENMEM_access_rw);
             return 1;
         }
     }
@@ -1500,7 +1679,12 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t 
gla_valid, unsigned long gla,
         else if ( v->arch.mem_event.emulate_flags & 
MEM_EVENT_FLAG_EMULATE_NOWRITE )
             hvm_emulate_one_full(1);
         else
-            hvm_emulate_one_full(0);
+        {
+            if ( access_x )
+                vmx_start_reexecute_instruction(v, gpa, XENMEM_access_x);
+            else
+                hvm_emulate_one_full(0);
+        }
 
         v->arch.mem_event.emulate_flags = 0;
         return 1;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 44d2919..175c898 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -124,6 +124,10 @@ struct vcpu *alloc_vcpu(
     v->sse_pg_dirty.eip = 0;
     v->sse_pg_dirty.gla = 0;
 
+    v->rexec_level = 0;
+    memset(v->rexec_context, 0, sizeof(v->rexec_context));
+    v->in_host = 0;
+
     spin_lock_init(&v->virq_lock);
 
     tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
@@ -263,6 +267,8 @@ struct domain *domain_create(
     d->fault_info.virtual_address = 0;
     d->fault_info.write_access = 0;
 
+    spin_lock_init(&d->rexec_lock);
+
     lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain");
 
     if ( (err = xsm_alloc_security_domain(d)) != 0 )
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 567a124..07ee19f 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -24,6 +24,7 @@
 #include <public/sysctl.h>
 #include <public/vcpu.h>
 #include <public/mem_event.h>
+#include <public/memory.h>
 #include <public/event_channel.h>
 
 #ifdef CONFIG_COMPAT
@@ -250,6 +251,20 @@ struct vcpu
            unsigned long eip;
            unsigned long gla;
     } sse_pg_dirty;
+
+#define REEXECUTION_MAX_DEPTH 8
+
+    struct rexec_context_t {
+        unsigned long gpa;
+        xenmem_access_t old_access;
+        xenmem_access_t cur_access;
+    } rexec_context[REEXECUTION_MAX_DEPTH];
+
+    int rexec_level;
+
+    /* Will be true when the vcpu is in VMX root,
+     * false when it is not. */
+    bool_t in_host;
 };
 
 /* Per-domain lock can be recursively acquired in fault handlers. */
@@ -459,6 +474,8 @@ struct domain
         uint64_t virtual_address;
         uint32_t write_access;
     } fault_info;
+
+    spinlock_t rexec_lock;
 };
 
 struct domain_setup_info
-- 
1.7.9.5
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
 
 
 | 
|  | Lists.xenproject.org is hosted with RackSpace, monitoring our |