|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH RFC 8/9] xen: Generic instruction re-execution mechanism for execute faults
The Xen emulator is incapable of handling some instructions, which
leads to the injection of an Invalid Opcode exception (#UD) inside
the guest once an unsupported instruction is encountered.
A new mechanism has been added which is able to generically re-execute
instructions, by temporarily granting permissions inside the EPT and
re-executing the instruction with all other vcpus paused and with the
monitor trap flag set. The mechanism is re-entrant, meaning that it is
capable of handling different violations caused by the same instruction.
Usually, a security appliance will decide when and what instructions
must be re-executed this way (instructions that lie in non-executable
pages and instructions that cause the setting of Accessed and/or Dirty
flags inside page tables are two examples).
Signed-off-by: Andrei Lutas <vlutas@xxxxxxxxxxxxxxx>
---
xen/arch/x86/hvm/vmx/vmx.c | 51 ++++++++++++
xen/arch/x86/mm/p2m.c | 188 +++++++++++++++++++++++++++++++++++++++++++-
xen/common/domain.c | 6 ++
xen/include/xen/sched.h | 17 ++++
4 files changed, 260 insertions(+), 2 deletions(-)
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 4a9a7c8..4976215 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2568,12 +2568,60 @@ void vmx_handle_EOI_induced_exit(struct vlapic *vlapic,
int vector)
vlapic_handle_EOI_induced_exit(vlapic, vector);
}
+/*
+ * Finish an instruction re-execution previously started on vCPU "v".
+ *
+ * Restores, newest entry first, the EPT access rights saved in
+ * v->rexec_context[], then (under the domain's rexec_lock) resets
+ * v->rexec_level to zero and unpauses every other vCPU of the domain.
+ *
+ * A failure to restore one entry is recorded in the return value but does
+ * not abort the teardown. The MONITOR_TRAP_FLAG exit handler clears the
+ * trap flag before calling this function, so returning early would leave
+ * the other vCPUs paused and rexec_level non-zero with no later exit left
+ * to undo that state.
+ *
+ * Returns 0 on success (or when no re-execution is in progress), -1 if at
+ * least one saved access right could not be restored.
+ */
+static int vmx_stop_reexecute_instruction(struct vcpu *v)
+{
+    int ret = 0, i;
+    struct vcpu *a;
+
+    if ( 0 == v->rexec_level )
+        return 0;
+
+    /* Step 1: Restore original EPT access rights for each GPA, walking the
+     * saved contexts from the most recent one down to the first. */
+    for ( i = v->rexec_level - 1; i >= 0; i-- )
+    {
+        if ( 0 != p2m_set_mem_access(v->domain,
+                                     v->rexec_context[i].gpa >> PAGE_SHIFT,
+                                     1, 0, 0xff,
+                                     v->rexec_context[i].old_access) )
+            ret = -1; /* Remember the failure, but keep tearing down. */
+
+        v->rexec_context[i].gpa = 0;
+    }
+
+    spin_lock(&v->domain->rexec_lock);
+
+    /* Step 2: Reset the nesting level to zero. */
+    v->rexec_level = 0;
+
+    /* Step 3: Resume all other VCPUs. */
+    for_each_vcpu ( v->domain, a )
+    {
+        if ( a == v )
+            continue;
+
+        /* Unpause the VCPU. */
+        vcpu_unpause(a);
+    }
+
+    /* Step 4: Remove the MONITOR trap flag.
+     * - this is already done when handling the exit. */
+
+    /* Step 5: We're done! */
+
+    spin_unlock(&v->domain->rexec_lock);
+
+    return ret;
+}
+
+
void vmx_vmexit_handler(struct cpu_user_regs *regs)
{
unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0;
unsigned int vector = 0;
struct vcpu *v = current;
+ v->in_host = 1;
+
__vmread(GUEST_RIP, ®s->rip);
__vmread(GUEST_RSP, ®s->rsp);
__vmread(GUEST_RFLAGS, ®s->rflags);
@@ -3074,6 +3122,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
case EXIT_REASON_MONITOR_TRAP_FLAG:
v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
vmx_update_cpu_exec_control(v);
+ vmx_stop_reexecute_instruction(v);
if ( v->arch.hvm_vcpu.single_step ) {
hvm_memory_event_single_step(regs->eip);
if ( v->domain->debugger_attached )
@@ -3191,6 +3240,8 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs)
check_pf_injection();
out:
+ curr->in_host = 0;
+
HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
__vmwrite(GUEST_RIP, regs->rip);
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 4dd3f1b..ff67b09 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -34,6 +34,7 @@
#include <public/mem_event.h>
#include <asm/mem_sharing.h>
#include <xen/event.h>
+#include <xen/hypercall.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/svm/amd-iommu-proto.h>
#include <xsm/xsm.h>
@@ -1394,6 +1395,181 @@ static void p2m_set_ad_bits(struct vcpu *v, struct
p2m_domain *p2m,
v->sse_pg_dirty.gla = ga;
}
+static int vmx_start_reexecute_instruction(struct vcpu *v,
+ unsigned long gpa,
+ xenmem_access_t required_access)
+{
+ /* Prepare vCPU "v" to re-execute the instruction that faulted on "gpa".
+ *
+ * Under the domain's rexec_lock: gives up (returning 0) if another vCPU
+ * already has a re-execution in progress, fails (returning -1) if
+ * REEXECUTION_MAX_DEPTH nesting levels are already in use, otherwise
+ * pauses all other vCPUs (first nesting level only) and increments
+ * v->rexec_level. After dropping the lock it saves the page's current
+ * access rights in v->rexec_context[], applies the union of
+ * "required_access" and any rights this vCPU already granted on the same
+ * GPA, and, at the first nesting level, sets the monitor trap flag.
+ *
+ * Returns 0 on success or when deferring to another vCPU, -1 on error.
+ *
+ * NOTE: Some required_access values may be invalid. For example, one
+ * cannot grant only write access on a given page; read/write
+ * access must be granted instead. These inconsistencies are NOT
+ * checked here. The caller must ensure that "required_access" is
+ * an allowed combination.
+ *
+ * NOTE(review): the "return -1" paths taken after the lock is dropped
+ * run after rexec_level was incremented and the other vCPUs were paused;
+ * nothing in this function undoes that - confirm the teardown path copes.
+ * NOTE(review): "found" is assigned below but never read. */
+
+ int ret = 0, i, found = 0, r = 0, w = 0, x = 0, level = 0, leave = 0;
+ xenmem_access_t old_access, new_access;
+ struct vcpu *a;
+
+ spin_lock(&v->domain->rexec_lock);
+
+ level = v->rexec_level;
+
+ /* Step 1: Make sure someone else didn't get to start an
+ * instruction re-execution. */
+ for_each_vcpu ( v->domain, a )
+ {
+ /* Skip self/v; only the other VCPUs are checked here. */
+ if ( a == v )
+ continue;
+
+ /* Check if "a" started an instruction re-execution. If so,
+ * return success, as we'll re-execute our instruction later. */
+ if ( 0 != a->rexec_level )
+ {
+ /* Presumably "a" has paused (or is pausing) this VCPU. */
+ ret = 0;
+ leave = 1;
+ goto release_and_exit;
+ }
+ }
+
+ /* Step 2: Make sure we're not exceeding the max re-execution depth. */
+ if ( level >= REEXECUTION_MAX_DEPTH )
+ {
+ ret = -1;
+ leave = 1;
+ goto release_and_exit;
+ }
+
+ /* Step 3: Pause all the VCPUs, except self. Note that we have to do
+ * this only if we're at nesting level 0; if we're at a higher level
+ * of nested re-exec, the vcpus are already paused. */
+ if ( 0 == level )
+ {
+ for_each_vcpu ( v->domain, a )
+ {
+ /* We're interested in pausing all the VCPUs except self/v. */
+ if ( a == v )
+ continue;
+
+ /* Pause, NO SYNC! We're gonna do our own syncing. */
+ vcpu_pause_nosync(a);
+ }
+
+ /* Step 4: Wait for all the paused VCPUs to actually leave the VMX
+ * non-root realm and enter VMX root. */
+ for_each_vcpu ( v->domain, a )
+ {
+ /* Skip self/v; we only wait for the other VCPUs. */
+ if ( a == v )
+ continue;
+
+ /* Busy-wait (with rexec_lock still held) until the VCPU's
+ * in_host flag is set. */
+ while ( !a->in_host )
+ cpu_relax();
+ }
+ }
+
+ /* Update the re-execution nesting level. */
+ v->rexec_level++;
+
+release_and_exit:
+ spin_unlock(&v->domain->rexec_lock);
+
+ /* If we bailed out above (another re-execution is in progress, or the
+ * maximum depth was reached), return now with the chosen result. */
+ if ( leave )
+ return ret;
+
+ /* Step 5: Save the current gpa & old access rights. Also, check if this
+ * is a "double-fault" on the exact same GPA, in which case, we will
+ * promote the rights of this particular GPA, and try again. */
+ for ( i = 0; i < level; i++ )
+ {
+ if (v->rexec_context[i].gpa == gpa)
+ {
+ /* This GPA is already in the queue; accumulate the rights that
+ * were granted on it so far into r/w/x. */
+
+ found = 1;
+
+ switch (v->rexec_context[i].cur_access) {
+ case XENMEM_access_r: r = 1; break;
+ case XENMEM_access_w: w = 1; break;
+ case XENMEM_access_x: x = 1; break;
+ case XENMEM_access_rx: r = x = 1; break;
+ case XENMEM_access_wx: w = x = 1; break;
+ case XENMEM_access_rw: r = w = 1; break;
+ case XENMEM_access_rwx: r = w = x = 1; break;
+ default: break; // We don't care about any other case.
+ }
+ }
+ }
+
+ /* Get the current EPT access rights. They will be restored when we're
+ * done.
+ * Note that the restoration is done in reverse-order, in order to ensure
+ * that the original access rights are restored correctly. Otherwise, we may
+ * restore whatever access rights were modified by another re-execution
+ * request, and that would be bad. */
+ if ( 0 != p2m_get_mem_access(v->domain, gpa >> PAGE_SHIFT, &old_access) )
+ return -1;
+
+ v->rexec_context[level].gpa = gpa;
+ v->rexec_context[level].old_access = old_access;
+
+ /* Step 6: Mark the GPA with the required access, so we can re-execute
+ * the instruction. The requested rights are added on top of any r/w/x
+ * bits collected in Step 5. */
+ switch ( required_access )
+ {
+ case XENMEM_access_r: r = 1; break;
+ case XENMEM_access_w: w = 1; break;
+ case XENMEM_access_x: x = 1; break;
+ case XENMEM_access_rx: r = x = 1; break;
+ case XENMEM_access_wx: w = x = 1; break;
+ case XENMEM_access_rw: r = w = 1; break;
+ case XENMEM_access_rwx: r = w = x = 1; break;
+ default: break; // We don't care about any other case.
+ }
+
+ /* Now transform our RWX values in a XENMEM_access_* constant. */
+ if ( 0 == r && 0 == w && 0 == x )
+ new_access = XENMEM_access_n;
+ else if ( 0 == r && 0 == w && 1 == x )
+ new_access = XENMEM_access_x;
+ else if ( 0 == r && 1 == w && 0 == x )
+ new_access = XENMEM_access_w;
+ else if ( 0 == r && 1 == w && 1 == x )
+ new_access = XENMEM_access_wx;
+ else if ( 1 == r && 0 == w && 0 == x )
+ new_access = XENMEM_access_r;
+ else if ( 1 == r && 0 == w && 1 == x )
+ new_access = XENMEM_access_rx;
+ else if ( 1 == r && 1 == w && 0 == x )
+ new_access = XENMEM_access_rw;
+ else if ( 1 == r && 1 == w && 1 == x )
+ new_access = XENMEM_access_rwx;
+ else
+ new_access = required_access; /* Should never get here. */
+
+ /* And save the access rights we are about to apply. */
+ v->rexec_context[level].cur_access = new_access;
+
+ /* Apply the changes inside the EPT. */
+ if ( 0 != p2m_set_mem_access(v->domain, gpa >> PAGE_SHIFT,
+ 1, 0, 0xff, new_access) )
+ return -1;
+
+ /* Step 7: Reconfigure the VMCS, so it suits our needs. We want a
+ * VM-exit to be generated after the instruction has been
+ * successfully re-executed. Only needed at the first nesting level. */
+ if ( 0 == level )
+ {
+ v->arch.hvm_vmx.exec_control |= CPU_BASED_MONITOR_TRAP_FLAG;
+ vmx_update_cpu_exec_control(v);
+ }
+
+ /* Step 8: We should be done! */
+
+ return ret;
+}
+
+
bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla,
bool_t access_r, bool_t access_w, bool_t access_x,
mem_event_request_t **req_ptr)
@@ -1472,7 +1648,10 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t
gla_valid, unsigned long gla,
v->arch.mem_event.gpa = 0;
v->arch.mem_event.eip = 0;
- p2m_set_ad_bits(v, p2m, gla);
+ if ( 0 == gpa )
+ p2m_set_ad_bits(v, p2m, gla);
+ else
+ vmx_start_reexecute_instruction(v, gpa, XENMEM_access_rw);
return 1;
}
}
@@ -1500,7 +1679,12 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t
gla_valid, unsigned long gla,
else if ( v->arch.mem_event.emulate_flags &
MEM_EVENT_FLAG_EMULATE_NOWRITE )
hvm_emulate_one_full(1);
else
- hvm_emulate_one_full(0);
+ {
+ if ( access_x )
+ vmx_start_reexecute_instruction(v, gpa, XENMEM_access_x);
+ else
+ hvm_emulate_one_full(0);
+ }
v->arch.mem_event.emulate_flags = 0;
return 1;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 44d2919..175c898 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -124,6 +124,10 @@ struct vcpu *alloc_vcpu(
v->sse_pg_dirty.eip = 0;
v->sse_pg_dirty.gla = 0;
+ v->rexec_level = 0;
+ memset(v->rexec_context, 0, sizeof(v->rexec_context));
+ v->in_host = 0;
+
spin_lock_init(&v->virq_lock);
tasklet_init(&v->continue_hypercall_tasklet, NULL, 0);
@@ -263,6 +267,8 @@ struct domain *domain_create(
d->fault_info.virtual_address = 0;
d->fault_info.write_access = 0;
+ spin_lock_init(&d->rexec_lock);
+
lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain");
if ( (err = xsm_alloc_security_domain(d)) != 0 )
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 567a124..07ee19f 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -24,6 +24,7 @@
#include <public/sysctl.h>
#include <public/vcpu.h>
#include <public/mem_event.h>
+#include <public/memory.h>
#include <public/event_channel.h>
#ifdef CONFIG_COMPAT
@@ -250,6 +251,20 @@ struct vcpu
unsigned long eip;
unsigned long gla;
} sse_pg_dirty;
+
+#define REEXECUTION_MAX_DEPTH 8
+
+ struct rexec_context_t {
+ unsigned long gpa;
+ xenmem_access_t old_access;
+ xenmem_access_t cur_access;
+ } rexec_context[REEXECUTION_MAX_DEPTH];
+
+ int rexec_level;
+
+ /* Will be true when the vcpu is in VMX root,
+ * false when it is not. */
+ bool_t in_host;
};
/* Per-domain lock can be recursively acquired in fault handlers. */
@@ -459,6 +474,8 @@ struct domain
uint64_t virtual_address;
uint32_t write_access;
} fault_info;
+
+ spinlock_t rexec_lock;
};
struct domain_setup_info
--
1.7.9.5
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |