Xen project Mailing List

[Xen-devel] [PATCH RFC 8/9] xen: Generic instruction re-execution mechanism for execute faults

From: Razvan Cojocaru <rcojocaru@xxxxxxxxxxxxxxx>

Date: Wed, 2 Jul 2014 16:34:00 +0300

Cc: Andrei Lutas <vlutas@xxxxxxxxxxxxxxx>, tim@xxxxxxx, Razvan Cojocaru <rcojocaru@xxxxxxxxxxxxxxx>

Comment: DomainKeys? See http://domainkeys.sourceforge.net/

Delivery-date: Wed, 02 Jul 2014 13:34:28 +0000

Domainkey-signature: a=rsa-sha1; q=dns; c=nofws; s=default; d=bitdefender.com; b=Pi/7NEwBlE0wXNItnj226SPKfzbzb4Uho++OwNBqWMWB7PRHE4xqH6Hnvh7weAN+JUI1IBhltGXQmRT/SipzpJ6D2nREz+MJJrXjYAma0XxskRzEPuUAFzysF1IZpg41paPggl1Hi6GUVRckezjMu/pZuKOAEqeVpRmt55j4dw10FKq9igWWleykGaxaXbNCDwP291dYv1+fwHUqAZFz6JtytDqIdQ7+1KWN9OEklP0TB0M6b4P1J9uVn0MNB+fyHUTUhLlYfG8Yu34oI8nSQwHB9lMRrdF0TwgMcRNAt2U/zFWj96bbyuteodzIc23Q2HcrpTPkHTOdMmzSbkqp/w==; h=Received:Received:Received:Received:From:To:Cc:Subject:Date:Message-Id:X-Mailer:In-Reply-To:References:X-BitDefender-Scanner:X-BitDefender-Spam:X-BitDefender-SpamStamp:X-BitDefender-CF-Stamp;

List-id: Xen developer discussion <xen-devel.lists.xen.org>

The Xen emulator is incapable of handling some instructions, which leads to the injection of an Invalid Opcode exception (#UD) inside the guest once an unsupported instruction is encountered. A new mechanism has been added which is able to generically re-execute instructions, by temporarily granting permissions inside the EPT and re-executing the instruction with all other vcpus paused and with the monitor trap flag set. The mechanism is re-entrant, meaning that it is capable of handling different violations caused by the same instruction. Usually, a security appliance will decide when and what instructions must be re-executed this way (instructions that lie in non-executable pages and instructions that cause the setting of Accessed and/or Dirty flags inside page tables are two examples). Signed-off-by: Andrei Lutas <vlutas@xxxxxxxxxxxxxxx> --- xen/arch/x86/hvm/vmx/vmx.c | 51 ++++++++++++ xen/arch/x86/mm/p2m.c | 188 +++++++++++++++++++++++++++++++++++++++++++- xen/common/domain.c | 6 ++ xen/include/xen/sched.h | 17 ++++ 4 files changed, 260 insertions(+), 2 deletions(-) diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index 4a9a7c8..4976215 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -2568,12 +2568,60 @@ void vmx_handle_EOI_induced_exit(struct vlapic *vlapic, int vector) vlapic_handle_EOI_induced_exit(vlapic, vector); } +static int vmx_stop_reexecute_instruction(struct vcpu *v) +{ + int ret = 0, i; + struct vcpu *a; + + if ( 0 == v->rexec_level ) + return 0; + + /* Step 1: Restore original EPT access rights for each GPA. */ + for ( i = v->rexec_level - 1; i >= 0; i-- ) + { + if ( 0 != p2m_set_mem_access(v->domain, v->rexec_context[i].gpa >> PAGE_SHIFT, + 1, 0, 0xff, v->rexec_context[i].old_access) ) + { + ret = -1; + return ret; + } + + v->rexec_context[i].gpa = 0; + } + + spin_lock(&v->domain->rexec_lock); + + /* Step 2: Reset the nesting level to zero. */ + v->rexec_level = 0; + + /* Step 3: Resume all other VCPUs. 
*/ + for_each_vcpu ( v->domain, a ) + { + if ( a == v ) + continue; + + /* Unpause the VCPU. */ + vcpu_unpause(a); + } + + /* Step 4: Remove the MONITOR trap flag. + * - this is already done when handling the exit. */ + + /* Step 5: We're done! */ + + spin_unlock(&v->domain->rexec_lock); + + return ret; +} + void vmx_vmexit_handler(struct cpu_user_regs *regs) { unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0; unsigned int vector = 0; struct vcpu *v = current; + v->in_host = 1; + __vmread(GUEST_RIP, &regs->rip); __vmread(GUEST_RSP, &regs->rsp); __vmread(GUEST_RFLAGS, &regs->rflags); @@ -3074,6 +3122,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) case EXIT_REASON_MONITOR_TRAP_FLAG: v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; vmx_update_cpu_exec_control(v); + vmx_stop_reexecute_instruction(v); if ( v->arch.hvm_vcpu.single_step ) { hvm_memory_event_single_step(regs->eip); if ( v->domain->debugger_attached ) @@ -3191,6 +3240,8 @@ void vmx_vmenter_helper(const struct cpu_user_regs *regs) check_pf_injection(); out: + curr->in_host = 0; + HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0); __vmwrite(GUEST_RIP, regs->rip); diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c index 4dd3f1b..ff67b09 100644 --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -34,6 +34,7 @@ #include <public/mem_event.h> #include <asm/mem_sharing.h> #include <xen/event.h> +#include <xen/hypercall.h> #include <asm/hvm/nestedhvm.h> #include <asm/hvm/svm/amd-iommu-proto.h> #include <xsm/xsm.h> @@ -1394,6 +1395,181 @@ static void p2m_set_ad_bits(struct vcpu *v, struct p2m_domain *p2m, v->sse_pg_dirty.gla = ga; } +static int vmx_start_reexecute_instruction(struct vcpu *v, + unsigned long gpa, + xenmem_access_t required_access) +{ + /* NOTE: Some required_accesses may be invalid. For example, one + * cannot grant only write access on a given page; read/write + * access must be granted instead. 
These inconsistencies are NOT + * checked here. The caller must ensure that "required_access" is + * an allowed combination. */ + + int ret = 0, i, found = 0, r = 0, w = 0, x = 0, level = 0, leave = 0; + xenmem_access_t old_access, new_access; + struct vcpu *a; + + spin_lock(&v->domain->rexec_lock); + + level = v->rexec_level; + + /* Step 1: Make sure someone else didn't get to start an + * instruction re-execution */ + for_each_vcpu ( v->domain, a ) + { + /* We're interested in pausing all the VCPUs except self/v. */ + if ( a == v ) + continue; + + /* Check if "a" started an instruction re-execution. If so, + * return success, as we'll re-execute our instruction later. */ + if ( 0 != a->rexec_level ) + { + /* We should be paused. */ + ret = 0; + leave = 1; + goto release_and_exit; + } + } + + /* Step 2: Make sure we're not exceeding the max re-execution depth. */ + if ( level >= REEXECUTION_MAX_DEPTH ) + { + ret = -1; + leave = 1; + goto release_and_exit; + } + + /* Step 2: Pause all the VCPUs, except self. Note that we have to do + * this only if we're at nesting level 0; if we're at a higher level + * of nested re-exec, the vcpus are already paused. */ + if ( 0 == level ) + { + for_each_vcpu ( v->domain, a ) + { + /* We're interested in pausing all the VCPUs except self/v. */ + if ( a == v ) + continue; + + /* Pause, NO SYNC! We're gonna do our own syncing. */ + vcpu_pause_nosync(a); + } + + /* Step 3: Wait for all the paused VCPUs to actually leave the VMX + * non-root realm and enter VMX root. */ + for_each_vcpu ( v->domain, a ) + { + /* We're interested in pausing all the VCPUs except self/v. */ + if ( a == v ) + continue; + + /* Pause, synced. */ + while ( !a->in_host ) + cpu_relax(); + } + } + + /* Update the rexecution nexting level. */ + v->rexec_level++; + +release_and_exit: + spin_unlock(&v->domain->rexec_lock); + + /* If we've got errors so far, return. */ + if ( leave ) + return ret; + + /* Step 4: Save the current gpa & old access rights. 
Also, check if this + * is a "double-fault" on the exact same GPA, in which case, we will + * promote the rights of this particular GPA, and try again. */ + for ( i = 0; i < level; i++ ) + { + if (v->rexec_context[i].gpa == gpa) + { + /* This GPA is already in the queue. */ + + found = 1; + + switch (v->rexec_context[i].cur_access) { + case XENMEM_access_r: r = 1; break; + case XENMEM_access_w: w = 1; break; + case XENMEM_access_x: x = 1; break; + case XENMEM_access_rx: r = x = 1; break; + case XENMEM_access_wx: w = x = 1; break; + case XENMEM_access_rw: r = w = 1; break; + case XENMEM_access_rwx: r = w = x = 1; break; + default: break; // We don't care about any other case. + } + } + } + + /* Get the current EPT access rights. They will be restored when we're done. + * Note that the restoration is done in reverse-order, in order to ensure + * that the original access rights are restore correctly. Otherwise, we may + * restore whatever access rights were modified by another re-execution + * request, and that would be bad. */ + if ( 0 != p2m_get_mem_access(v->domain, gpa >> PAGE_SHIFT, &old_access) ) + return -1; + + v->rexec_context[level].gpa = gpa; + v->rexec_context[level].old_access = old_access; + + /* Step 5: Make the GPA with the required access, so we can re-execute + * the instruction. */ + switch ( required_access ) + { + case XENMEM_access_r: r = 1; break; + case XENMEM_access_w: w = 1; break; + case XENMEM_access_x: x = 1; break; + case XENMEM_access_rx: r = x = 1; break; + case XENMEM_access_wx: w = x = 1; break; + case XENMEM_access_rw: r = w = 1; break; + case XENMEM_access_rwx: r = w = x = 1; break; + default: break; // We don't care about any other case. + } + + /* Now transform our RWX values in a XENMEM_access_* constant. 
*/ + if ( 0 == r && 0 == w && 0 == x ) + new_access = XENMEM_access_n; + else if ( 0 == r && 0 == w && 1 == x ) + new_access = XENMEM_access_x; + else if ( 0 == r && 1 == w && 0 == x ) + new_access = XENMEM_access_w; + else if ( 0 == r && 1 == w && 1 == x ) + new_access = XENMEM_access_wx; + else if ( 1 == r && 0 == w && 0 == x ) + new_access = XENMEM_access_r; + else if ( 1 == r && 0 == w && 1 == x ) + new_access = XENMEM_access_rx; + else if ( 1 == r && 1 == w && 0 == x ) + new_access = XENMEM_access_rw; + else if ( 1 == r && 1 == w && 1 == x ) + new_access = XENMEM_access_rwx; + else + new_access = required_access; /* Should never get here. */ + + /* And save the current access rights. */ + v->rexec_context[level].cur_access = new_access; + + /* Apply the changes inside the EPT. */ + if ( 0 != p2m_set_mem_access(v->domain, gpa >> PAGE_SHIFT, + 1, 0, 0xff, new_access) ) + return -1; + + /* Step 6: Reconfigure the VMCS, so it suits our needs. We want a + * VM-exit to be generated after the instruction has been + * successfully re-executed. */ + if ( 0 == level ) + { + v->arch.hvm_vmx.exec_control |= CPU_BASED_MONITOR_TRAP_FLAG; + vmx_update_cpu_exec_control(v); + } + + /* Step 8: We should be done! 
*/ + + return ret; +} + bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, bool_t access_r, bool_t access_w, bool_t access_x, mem_event_request_t **req_ptr) @@ -1472,7 +1648,10 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, v->arch.mem_event.gpa = 0; v->arch.mem_event.eip = 0; - p2m_set_ad_bits(v, p2m, gla); + if ( 0 == gpa ) + p2m_set_ad_bits(v, p2m, gla); + else + vmx_start_reexecute_instruction(v, gpa, XENMEM_access_rw); return 1; } } @@ -1500,7 +1679,12 @@ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, else if ( v->arch.mem_event.emulate_flags & MEM_EVENT_FLAG_EMULATE_NOWRITE ) hvm_emulate_one_full(1); else - hvm_emulate_one_full(0); + { + if ( access_x ) + vmx_start_reexecute_instruction(v, gpa, XENMEM_access_x); + else + hvm_emulate_one_full(0); + } v->arch.mem_event.emulate_flags = 0; return 1; diff --git a/xen/common/domain.c b/xen/common/domain.c index 44d2919..175c898 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -124,6 +124,10 @@ struct vcpu *alloc_vcpu( v->sse_pg_dirty.eip = 0; v->sse_pg_dirty.gla = 0; + v->rexec_level = 0; + memset(v->rexec_context, 0, sizeof(v->rexec_context)); + v->in_host = 0; + spin_lock_init(&v->virq_lock); tasklet_init(&v->continue_hypercall_tasklet, NULL, 0); @@ -263,6 +267,8 @@ struct domain *domain_create( d->fault_info.virtual_address = 0; d->fault_info.write_access = 0; + spin_lock_init(&d->rexec_lock); + lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain"); if ( (err = xsm_alloc_security_domain(d)) != 0 ) diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 567a124..07ee19f 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -24,6 +24,7 @@ #include <public/sysctl.h> #include <public/vcpu.h> #include <public/mem_event.h> +#include <public/memory.h> #include <public/event_channel.h> #ifdef CONFIG_COMPAT @@ -250,6 +251,20 @@ struct vcpu unsigned long eip; unsigned long 
gla; } sse_pg_dirty; + +#define REEXECUTION_MAX_DEPTH 8 + + struct rexec_context_t { + unsigned long gpa; + xenmem_access_t old_access; + xenmem_access_t cur_access; + } rexec_context[REEXECUTION_MAX_DEPTH]; + + int rexec_level; + + /* Will be true when the vcpu is in VMX root, + * false when it is not. */ + bool_t in_host; }; /* Per-domain lock can be recursively acquired in fault handlers. */ @@ -459,6 +474,8 @@ struct domain uint64_t virtual_address; uint32_t write_access; } fault_info; + + spinlock_t rexec_lock; }; struct domain_setup_info -- 1.7.9.5 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.