[Xen-devel] [PATCH v1] x86/hvm: Generic instruction re-execution mechanism for execute faults
A new mechanism has been added which is able to generically re-execute
instructions by temporarily granting permissions inside the EPT and
re-executing the instruction with all other vcpus paused and with the
monitor trap flag set. The mechanism is re-entrant, meaning that it is
capable of handling different violations caused by the same instruction.
Usually, a security appliance will decide when and which instructions must
be re-executed this way; instructions that lie in non-executable pages and
instructions that cause the Accessed and/or Dirty flags to be set inside
page tables are two examples.

Signed-off-by: Alexandru Isaila <aisaila@xxxxxxxxxxxxxxx>
Signed-off-by: Andrei Lutas <vlutas@xxxxxxxxxxxxxxx>
Signed-off-by: Mihai Donțu <mdontu@xxxxxxxxxxxxxxx>
Signed-off-by: Anshul Makkar <anshul.makkar@xxxxxxxxxx>
---
 xen/arch/x86/domain.c         |   3 +
 xen/arch/x86/hvm/vmx/vmx.c    | 255 ++++++++++++++++++++++++++++++++++
 xen/arch/x86/mm/mem_access.c  |  20 ++-
 xen/include/asm-x86/domain.h  |  18 +++
 xen/include/asm-x86/hvm/hvm.h |   2 +
 5 files changed, 295 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 295b10c48c..b0680a76f1 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -343,6 +343,7 @@ int arch_vcpu_create(struct vcpu *v)
     int rc;
 
     v->arch.flags = TF_kernel_mode;
+    v->arch.in_host = 1;
 
     rc = mapcache_vcpu_init(v);
     if ( rc )
@@ -482,6 +483,8 @@ int arch_domain_create(struct domain *d,
     spin_lock_init(&d->arch.e820_lock);
     spin_lock_init(&d->arch.vtsc_lock);
 
+    spin_lock_init(&d->arch.rexec_lock);
+
     /* Minimal initialisation for the idle domain. */
     if ( unlikely(is_idle_domain(d)) )
     {
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 365eeb2886..84f8648fc0 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2289,6 +2289,255 @@ static bool vmx_get_pending_event(struct vcpu *v, struct x86_event *info)
     return true;
 }
 
+static int vmx_start_reexecute_instruction(struct vcpu *v, unsigned long gpa,
+                                           xenmem_access_t required_access)
+{
+    /*
+     * NOTE: Some required_accesses may be invalid. For example, one
+     * cannot grant only write access on a given page; read/write
+     * access must be granted instead. These inconsistencies are NOT
+     * checked here. The caller must ensure that "required_access" is
+     * an allowed combination.
+     */
+
+    int ret = 0, i, found = 0, r = 0, w = 0, x = 0, level = 0, leave = 0;
+    xenmem_access_t old_access, new_access;
+    struct vcpu *a;
+    unsigned int altp2m_idx =
+        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
+
+    spin_lock(&v->domain->arch.rexec_lock);
+
+    level = v->arch.rexec_level;
+
+    /*
+     * Step 1: Make sure someone else didn't get to start an
+     * instruction re-execution.
+     */
+    for_each_vcpu ( v->domain, a )
+    {
+        /* We're interested in pausing all the VCPUs except self/v. */
+        if ( a == v )
+            continue;
+
+        /*
+         * Check if "a" started an instruction re-execution. If so,
+         * return success, as we'll re-execute our instruction later.
+         */
+        if ( a->arch.rexec_level != 0 )
+        {
+            /* We should be paused. */
+            ret = 0;
+            leave = 1;
+            goto release_and_exit;
+        }
+    }
+
+    /* Step 2: Make sure we're not exceeding the max re-execution depth. */
+    if ( level >= REEXECUTION_MAX_DEPTH )
+    {
+        ret = -1;
+        leave = 1;
+        goto release_and_exit;
+    }
+
+    /*
+     * Step 2: Pause all the VCPUs, except self. Note that we have to do
+     * this only if we're at nesting level 0; if we're at a higher level
+     * of nested re-exec, the vcpus are already paused.
+     */
+    if ( level == 0 )
+    {
+        for_each_vcpu ( v->domain, a )
+        {
+            /* We're interested in pausing all the VCPUs except self/v. */
+            if ( a == v )
+                continue;
+
+            /* Pause, NO SYNC! We're gonna do our own syncing. */
+            vcpu_pause_nosync(a);
+        }
+
+        /*
+         * Step 3: Wait for all the paused VCPUs to actually leave the VMX
+         * non-root realm and enter VMX root.
+         */
+        for_each_vcpu ( v->domain, a )
+        {
+            /* We're interested in pausing all the VCPUs except self/v. */
+            if ( a == v )
+                continue;
+
+            /* Pause, synced. */
+            while ( !a->arch.in_host )
+                cpu_relax();
+        }
+    }
+
+    /* Update the re-execution nesting level. */
+    v->arch.rexec_level++;
+
+release_and_exit:
+    spin_unlock(&v->domain->arch.rexec_lock);
+
+    /* If we've got errors so far, return. */
+    if ( leave )
+        return ret;
+
+    /*
+     * Step 4: Save the current gpa & old access rights. Also, check if this
+     * is a "double-fault" on the exact same GPA, in which case, we will
+     * promote the rights of this particular GPA, and try again.
+     */
+    for ( i = 0; i < level; i++ )
+    {
+        if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) ==
+             (gpa >> PAGE_SHIFT) )
+        {
+            /* This GPA is already in the queue. */
+            found = 1;
+
+            switch (v->arch.rexec_context[i].cur_access) {
+            case XENMEM_access_r: r = 1; break;
+            case XENMEM_access_w: w = 1; break;
+            case XENMEM_access_x: x = 1; break;
+            case XENMEM_access_rx: r = x = 1; break;
+            case XENMEM_access_wx: w = x = 1; break;
+            case XENMEM_access_rw: r = w = 1; break;
+            case XENMEM_access_rwx: r = w = x = 1; break;
+            default: break; /* We don't care about any other case. */
+            }
+        }
+    }
+
+    /*
+     * Get the current EPT access rights. They will be restored when we're
+     * done. Note that the restoration is done in reverse-order, in order
+     * to ensure that the original access rights are restored correctly.
+     * Otherwise, we may restore whatever access rights were modified by
+     * another re-execution request, and that would be bad.
+     */
+    if ( p2m_get_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
+                            &old_access, altp2m_idx) != 0 )
+        return -1;
+
+    v->arch.rexec_context[level].gpa = gpa;
+    v->arch.rexec_context[level].old_access = old_access;
+    v->arch.rexec_context[level].old_single_step = v->arch.hvm.single_step;
+
+    /*
+     * Step 5: Mark the GPA with the required access, so we can re-execute
+     * the instruction.
+     */
+    switch ( required_access )
+    {
+    case XENMEM_access_r: r = 1; break;
+    case XENMEM_access_w: w = 1; break;
+    case XENMEM_access_x: x = 1; break;
+    case XENMEM_access_rx: r = x = 1; break;
+    case XENMEM_access_wx: w = x = 1; break;
+    case XENMEM_access_rw: r = w = 1; break;
+    case XENMEM_access_rwx: r = w = x = 1; break;
+    default: break; /* We don't care about any other case. */
+    }
+
+    /* Now transform our RWX values into a XENMEM_access_* constant. */
+    if ( r == 0 && w == 0 && x == 0 )
+        new_access = XENMEM_access_n;
+    else if ( r == 0 && w == 0 && x == 1 )
+        new_access = XENMEM_access_x;
+    else if ( r == 0 && w == 1 && x == 0 )
+        new_access = XENMEM_access_w;
+    else if ( r == 0 && w == 1 && x == 1 )
+        new_access = XENMEM_access_wx;
+    else if ( r == 1 && w == 0 && x == 0 )
+        new_access = XENMEM_access_r;
+    else if ( r == 1 && w == 0 && x == 1 )
+        new_access = XENMEM_access_rx;
+    else if ( r == 1 && w == 1 && x == 0 )
+        new_access = XENMEM_access_rw;
+    else if ( r == 1 && w == 1 && x == 1 )
+        new_access = XENMEM_access_rwx;
+    else
+        new_access = required_access; /* Should never get here. */
+
+    /* And save the current access rights. */
+    v->arch.rexec_context[level].cur_access = new_access;
+
+    /* Apply the changes inside the EPT. */
+    if ( p2m_set_mem_access(v->domain, _gfn(gpa >> PAGE_SHIFT),
+                            1, 0, MEMOP_CMD_MASK, new_access,
+                            altp2m_idx) != 0 )
+        return -1;
+
+    /*
+     * Step 6: Reconfigure the VMCS, so it suits our needs. We want a
+     * VM-exit to be generated after the instruction has been
+     * successfully re-executed.
+     */
+    if ( level == 0 )
+        v->arch.hvm.single_step = 1;
+
+    /* Step 8: We should be done! */
+
+    return ret;
+}
+
+static int vmx_stop_reexecute_instruction(struct vcpu *v)
+{
+    int ret = 0, i;
+    struct vcpu *a;
+    unsigned int altp2m_idx =
+        altp2m_active(v->domain) ? altp2m_vcpu_idx(v) : 0;
+
+    if ( v->arch.rexec_level == 0 )
+        return 0;
+
+    /* Step 1: Restore original EPT access rights for each GPA. */
+    for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
+    {
+        if ( v->arch.rexec_context[i].gpa != mfn_x(INVALID_MFN) &&
+             p2m_set_mem_access(v->domain,
+                                _gfn(v->arch.rexec_context[i].gpa >> PAGE_SHIFT),
+                                1, 0, MEMOP_CMD_MASK,
+                                v->arch.rexec_context[i].old_access,
+                                altp2m_idx) != 0 )
+        {
+            ret = -1;
+            return ret;
+        }
+
+        v->arch.rexec_context[i].gpa = 0;
+        v->arch.hvm.single_step = v->arch.rexec_context[i].old_single_step;
+    }
+
+    spin_lock(&v->domain->arch.rexec_lock);
+
+    /* Step 2: Reset the nesting level to zero. */
+    v->arch.rexec_level = 0;
+
+    /* Step 3: Resume all other VCPUs. */
+    for_each_vcpu ( v->domain, a )
+    {
+        if ( a == v )
+            continue;
+
+        /* Unpause the VCPU. */
+        vcpu_unpause(a);
+    }
+
+    /*
+     * Step 4: Remove the MONITOR trap flag.
+     * - this is already done when handling the exit.
+     */
+
+    /* Step 5: We're done! */
+
+    spin_unlock(&v->domain->arch.rexec_lock);
+
+    return ret;
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -2324,6 +2573,7 @@ static struct hvm_function_table __initdata vmx_function_table = {
     .invlpg               = vmx_invlpg,
     .cpu_up               = vmx_cpu_up,
     .cpu_down             = vmx_cpu_down,
+    .start_reexecute_instruction = vmx_start_reexecute_instruction,
     .wbinvd_intercept     = vmx_wbinvd_intercept,
     .fpu_dirty_intercept  = vmx_fpu_dirty_intercept,
     .msr_read_intercept   = vmx_msr_read_intercept,
@@ -3590,6 +3840,8 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     unsigned int vector = 0, mode;
     struct vcpu *v = current;
 
+    v->arch.in_host = 1;
+
     __vmread(GUEST_RIP,    &regs->rip);
     __vmread(GUEST_RSP,    &regs->rsp);
     __vmread(GUEST_RFLAGS, &regs->rflags);
@@ -4112,6 +4364,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
     case EXIT_REASON_MONITOR_TRAP_FLAG:
         v->arch.hvm.vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
         vmx_update_cpu_exec_control(v);
+        vmx_stop_reexecute_instruction(v);
         if ( v->arch.hvm.single_step )
         {
             hvm_monitor_debug(regs->rip,
@@ -4330,6 +4583,8 @@ bool vmx_vmenter_helper(const struct cpu_user_regs *regs)
     if ( unlikely(curr->arch.hvm.vmx.lbr_flags & LBR_FIXUP_MASK) )
         lbr_fixup();
 
+    curr->arch.in_host = 0;
+
     HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 
     __vmwrite(GUEST_RIP, regs->rip);
diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
index 2f1295e56a..5ae3a61b5c 100644
--- a/xen/arch/x86/mm/mem_access.c
+++ b/xen/arch/x86/mm/mem_access.c
@@ -212,10 +212,11 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
     }
 
     if ( vm_event_check_ring(d->vm_event_monitor) &&
          d->arch.monitor.inguest_pagefault_disabled &&
-         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
+         npfec.kind != npfec_kind_with_gla &&
+         hvm_funcs.start_reexecute_instruction ) /* don't send a mem_event */
     {
-        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
-
+        v->arch.vm_event->emulate_flags = 0;
+        hvm_funcs.start_reexecute_instruction(v, gpa, XENMEM_access_rw);
         return true;
     }
@@ -226,6 +227,7 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
         *req_ptr = req;
 
         req->reason = VM_EVENT_REASON_MEM_ACCESS;
+
         req->u.mem_access.gfn = gfn_x(gfn);
         req->u.mem_access.offset = gpa & ((1 << PAGE_SHIFT) - 1);
@@ -377,6 +379,8 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
     p2m_access_t a;
     unsigned long gfn_l;
     long rc = 0;
+    struct vcpu *v;
+    int i;
 
     /* altp2m view 0 is treated as the hostp2m */
 #ifdef CONFIG_HVM
@@ -413,6 +417,16 @@ long p2m_set_mem_access(struct domain *d, gfn_t gfn, uint32_t nr,
         if ( rc )
             break;
 
+        for_each_vcpu(d, v)
+        {
+            if ( !v->arch.rexec_level )
+                continue;
+
+            for ( i = v->arch.rexec_level - 1; i >= 0; i-- )
+                if ( (v->arch.rexec_context[i].gpa >> PAGE_SHIFT) == gfn_x(gfn) )
+                    v->arch.rexec_context[i].gpa = mfn_x(INVALID_MFN);
+        }
+
         /* Check for continuation if it's not the last iteration. */
         if ( nr > ++start && !(start & mask) && hypercall_preempt_check() )
         {
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 277f99f633..dbb68f108a 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -438,6 +438,8 @@ struct arch_domain
 
     /* Emulated devices enabled bitmap. */
     uint32_t emulation_flags;
+
+    spinlock_t rexec_lock;
 } __cacheline_aligned;
 
 #ifdef CONFIG_HVM
@@ -629,6 +631,22 @@ struct arch_vcpu
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
 
+#define REEXECUTION_MAX_DEPTH 8
+    struct rexec_context_t {
+        unsigned long gpa;
+        xenmem_access_t old_access;
+        xenmem_access_t cur_access;
+        bool_t old_single_step;
+    } rexec_context[REEXECUTION_MAX_DEPTH];
+
+    int rexec_level;
+
+    /*
+     * Will be true when the vcpu is in VMX root,
+     * false when it is not.
+     */
+    bool_t in_host;
+
     struct arch_vm_event *vm_event;
 
     struct vcpu_msrs *msrs;
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 3d3250dff0..1f5d43a98d 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -167,6 +167,8 @@ struct hvm_function_table {
     int  (*cpu_up)(void);
     void (*cpu_down)(void);
 
+    int (*start_reexecute_instruction)(struct vcpu *v, unsigned long gpa,
+                                       xenmem_access_t required_access);
     /* Copy up to 15 bytes from cached instruction bytes at current rIP. */
     unsigned int (*get_insn_bytes)(struct vcpu *v, uint8_t *buf);
-- 
2.17.1
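
For readers who want the idea without the VMX plumbing: each nested fault
pushes the faulting GPA and its current rights onto a small per-vcpu stack,
the rights are temporarily widened so the instruction can complete under the
monitor trap flag, and everything is unwound in reverse order afterwards.
The standalone sketch below models only that push/LIFO-restore logic; the
access_t encoding, the fake_ept_* helpers and the main() driver are invented
for illustration and are not Xen interfaces.

/* Toy model of the re-execution context stack -- not Xen code. */
#include <stdio.h>

#define REEXECUTION_MAX_DEPTH 8
#define PAGE_SHIFT 12

typedef enum {
    ACCESS_N, ACCESS_R, ACCESS_W, ACCESS_X,
    ACCESS_RW, ACCESS_RX, ACCESS_WX, ACCESS_RWX
} access_t;

struct rexec_context {
    unsigned long gpa;
    access_t old_access;   /* rights to restore when done */
    access_t cur_access;   /* rights granted for re-execution */
};

static struct rexec_context ctx[REEXECUTION_MAX_DEPTH];
static int rexec_level;

/* Pretend EPT: reports and prints one access value per GPA. */
static access_t fake_ept_get(unsigned long gpa) { (void)gpa; return ACCESS_R; }
static void fake_ept_set(unsigned long gpa, access_t a)
{
    printf("gfn %#lx -> access %d\n", gpa >> PAGE_SHIFT, a);
}

static int start_reexecute(unsigned long gpa, access_t required)
{
    if ( rexec_level >= REEXECUTION_MAX_DEPTH )
        return -1;

    ctx[rexec_level].gpa = gpa;
    ctx[rexec_level].old_access = fake_ept_get(gpa);
    ctx[rexec_level].cur_access = required;
    fake_ept_set(gpa, required); /* grant access so the insn can run */
    rexec_level++;               /* the real code also arms single-stepping */
    return 0;
}

static void stop_reexecute(void)
{
    /* Restore in reverse order so the oldest saved rights win. */
    for ( int i = rexec_level - 1; i >= 0; i-- )
        fake_ept_set(ctx[i].gpa, ctx[i].old_access);
    rexec_level = 0;
}

int main(void)
{
    start_reexecute(0x1000, ACCESS_RW);  /* e.g. A/D-bit write fault */
    start_reexecute(0x5000, ACCESS_RWX); /* nested fault on another page */
    stop_reexecute();                    /* MTF exit: undo everything */
    return 0;
}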