Re: [Xen-devel] [PATCH for-next v3 01/22] x86/traps: move privilege instruction emulation code
I forgot to move gpr_switch.S. Here is an updated version. ---8<--- From 58df816b937dc7a3598de01f053a6030e631057e Mon Sep 17 00:00:00 2001 From: Wei Liu <wei.liu2@xxxxxxxxxx> Date: Thu, 18 May 2017 16:18:56 +0100 Subject: [PATCH] x86/traps: move privilege instruction emulation code Move relevant code to pv/emulate.c. Export emulate_privileged_op in pv/traps.h. Note that read_descriptor is duplicated in emulate.c. The duplication will be gone once all emulation code is moved. Also move gpr_switch.S to pv/ because the code in that file is only used by privilege instruction emulation. No functional change. Signed-off-by: Wei Liu <wei.liu2@xxxxxxxxxx> --- xen/arch/x86/pv/Makefile | 2 + xen/arch/x86/pv/emulate.c | 1470 ++++++++++++++++++++++++++++++ xen/arch/x86/{x86_64 => pv}/gpr_switch.S | 0 xen/arch/x86/traps.c | 1358 +-------------------------- xen/arch/x86/x86_64/Makefile | 1 - xen/include/asm-x86/pv/traps.h | 48 + 6 files changed, 1522 insertions(+), 1357 deletions(-) create mode 100644 xen/arch/x86/pv/emulate.c rename xen/arch/x86/{x86_64 => pv}/gpr_switch.S (100%) create mode 100644 xen/include/asm-x86/pv/traps.h diff --git a/xen/arch/x86/pv/Makefile b/xen/arch/x86/pv/Makefile index 489a9f59cb..f272f607d4 100644 --- a/xen/arch/x86/pv/Makefile +++ b/xen/arch/x86/pv/Makefile @@ -3,3 +3,5 @@ obj-y += traps.o obj-bin-y += dom0_build.init.o obj-y += domain.o +obj-y += emulate.o +obj-bin-y += gpr_switch.o diff --git a/xen/arch/x86/pv/emulate.c b/xen/arch/x86/pv/emulate.c new file mode 100644 index 0000000000..fb0d066a3b --- /dev/null +++ b/xen/arch/x86/pv/emulate.c @@ -0,0 +1,1470 @@ +/****************************************************************************** + * arch/x86/pv/emulate.c + * + * PV emulation code + * + * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; If not, see <http://www.gnu.org/licenses/>. + */ + +#include <xen/errno.h> +#include <xen/event.h> +#include <xen/guest_access.h> +#include <xen/iocap.h> +#include <xen/spinlock.h> +#include <xen/trace.h> + +#include <asm/apic.h> +#include <asm/debugreg.h> +#include <asm/hpet.h> +#include <asm/hypercall.h> +#include <asm/mc146818rtc.h> +#include <asm/p2m.h> +#include <asm/pv/traps.h> +#include <asm/shared.h> +#include <asm/traps.h> +#include <asm/x86_emulate.h> + +#include <xsm/xsm.h> + +#include "../x86_64/mmconfig.h" + +/****************** + * Helper functions + */ + +static int read_descriptor(unsigned int sel, + const struct vcpu *v, + unsigned long *base, + unsigned long *limit, + unsigned int *ar, + bool_t insn_fetch) +{ + struct desc_struct desc; + + if ( sel < 4) + desc.b = desc.a = 0; + else if ( __get_user(desc, + (const struct desc_struct *)(!(sel & 4) + ? 
GDT_VIRT_START(v) + : LDT_VIRT_START(v)) + + (sel >> 3)) ) + return 0; + if ( !insn_fetch ) + desc.b &= ~_SEGMENT_L; + + *ar = desc.b & 0x00f0ff00; + if ( !(desc.b & _SEGMENT_L) ) + { + *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) + + (desc.b & 0xff000000)); + *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000); + if ( desc.b & _SEGMENT_G ) + *limit = ((*limit + 1) << 12) - 1; +#ifndef NDEBUG + if ( sel > 3 ) + { + unsigned int a, l; + unsigned char valid; + + asm volatile ( + "larl %2,%0 ; setz %1" + : "=r" (a), "=qm" (valid) : "rm" (sel)); + BUG_ON(valid && ((a & 0x00f0ff00) != *ar)); + asm volatile ( + "lsll %2,%0 ; setz %1" + : "=r" (l), "=qm" (valid) : "rm" (sel)); + BUG_ON(valid && (l != *limit)); + } +#endif + } + else + { + *base = 0UL; + *limit = ~0UL; + } + + return 1; +} + +/*********************** + * I/O emulation support + */ + +struct priv_op_ctxt { + struct x86_emulate_ctxt ctxt; + struct { + unsigned long base, limit; + } cs; + char *io_emul_stub; + unsigned int bpmatch; + unsigned int tsc; +#define TSC_BASE 1 +#define TSC_AUX 2 +}; + +/* I/O emulation support. Helper routines for, and type of, the stack stub.*/ +void host_to_guest_gpr_switch(struct cpu_user_regs *); +unsigned long guest_to_host_gpr_switch(unsigned long); + +void (*pv_post_outb_hook)(unsigned int port, u8 value); + +typedef void io_emul_stub_t(struct cpu_user_regs *); + +static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode, + unsigned int port, unsigned int bytes) +{ + if ( !ctxt->io_emul_stub ) + ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) + + (this_cpu(stubs.addr) & + ~PAGE_MASK) + + STUB_BUF_SIZE / 2; + + /* movq $host_to_guest_gpr_switch,%rcx */ + ctxt->io_emul_stub[0] = 0x48; + ctxt->io_emul_stub[1] = 0xb9; + *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch; + /* callq *%rcx */ + ctxt->io_emul_stub[10] = 0xff; + ctxt->io_emul_stub[11] = 0xd1; + /* data16 or nop */ + ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66; + /* <io-access opcode> */ + ctxt->io_emul_stub[13] = opcode; + /* imm8 or nop */ + ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90; + /* ret (jumps to guest_to_host_gpr_switch) */ + ctxt->io_emul_stub[15] = 0xc3; + BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16); + + if ( ioemul_handle_quirk ) + ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs); + + /* Handy function-typed pointer to the stub. */ + return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2); +} + + +/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */ +static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs) +{ + unsigned int cpl = guest_kernel_mode(v, regs) ? + (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3; + + ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0); + + return IOPL(cpl) <= v->arch.pv_vcpu.iopl; +} + +/* Has the guest requested sufficient permission for this I/O access? */ +static int guest_io_okay( + unsigned int port, unsigned int bytes, + struct vcpu *v, struct cpu_user_regs *regs) +{ + /* If in user mode, switch to kernel mode just to read I/O bitmap. */ + int user_mode = !(v->arch.flags & TF_kernel_mode); +#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) + + if ( iopl_ok(v, regs) ) + return 1; + + if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) ) + { + union { uint8_t bytes[2]; uint16_t mask; } x; + + /* + * Grab permission bytes from guest space. Inaccessible bytes are + * read as 0xff (no access allowed). 
+ */ + TOGGLE_MODE(); + switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp, + port>>3, 2) ) + { + default: x.bytes[0] = ~0; + /* fallthrough */ + case 1: x.bytes[1] = ~0; + /* fallthrough */ + case 0: break; + } + TOGGLE_MODE(); + + if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 ) + return 1; + } + + return 0; +} + +/* Has the administrator granted sufficient permission for this I/O access? */ +static bool_t admin_io_okay(unsigned int port, unsigned int bytes, + const struct domain *d) +{ + /* + * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses. + * We never permit direct access to that register. + */ + if ( (port == 0xcf8) && (bytes == 4) ) + return 0; + + /* We also never permit direct access to the RTC/CMOS registers. */ + if ( ((port & ~1) == RTC_PORT(0)) ) + return 0; + + return ioports_access_permitted(d, port, port + bytes - 1); +} + +static bool_t pci_cfg_ok(struct domain *currd, unsigned int start, + unsigned int size, uint32_t *write) +{ + uint32_t machine_bdf; + + if ( !is_hardware_domain(currd) ) + return 0; + + if ( !CF8_ENABLED(currd->arch.pci_cf8) ) + return 1; + + machine_bdf = CF8_BDF(currd->arch.pci_cf8); + if ( write ) + { + const unsigned long *ro_map = pci_get_ro_map(0); + + if ( ro_map && test_bit(machine_bdf, ro_map) ) + return 0; + } + start |= CF8_ADDR_LO(currd->arch.pci_cf8); + /* AMD extended configuration space access? */ + if ( CF8_ADDR_HI(currd->arch.pci_cf8) && + boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 ) + { + uint64_t msr_val; + + if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) ) + return 0; + if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) ) + start |= CF8_ADDR_HI(currd->arch.pci_cf8); + } + + return !write ? + xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf, + start, start + size - 1, 0) == 0 : + pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0; +} + +uint32_t guest_io_read(unsigned int port, unsigned int bytes, + struct domain *currd) +{ + uint32_t data = 0; + unsigned int shift = 0; + + if ( admin_io_okay(port, bytes, currd) ) + { + switch ( bytes ) + { + case 1: return inb(port); + case 2: return inw(port); + case 4: return inl(port); + } + } + + while ( bytes != 0 ) + { + unsigned int size = 1; + uint32_t sub_data = ~0; + + if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) + { + sub_data = pv_pit_handler(port, 0, 0); + } + else if ( port == RTC_PORT(0) ) + { + sub_data = currd->arch.cmos_idx; + } + else if ( (port == RTC_PORT(1)) && + ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) ) + { + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0)); + sub_data = inb(RTC_PORT(1)); + spin_unlock_irqrestore(&rtc_lock, flags); + } + else if ( (port == 0xcf8) && (bytes == 4) ) + { + size = 4; + sub_data = currd->arch.pci_cf8; + } + else if ( (port & 0xfffc) == 0xcfc ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; + if ( pci_cfg_ok(currd, port & 3, size, NULL) ) + sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size); + } + + if ( size == 4 ) + return sub_data; + + data |= (sub_data & ((1u << (size * 8)) - 1)) << shift; + shift += size * 8; + port += size; + bytes -= size; + } + + return data; +} + +static unsigned int check_guest_io_breakpoint(struct vcpu *v, + unsigned int port, unsigned int len) +{ + unsigned int width, i, match = 0; + unsigned long start; + + if ( !(v->arch.debugreg[5]) || + !(v->arch.pv_vcpu.ctrlreg[4] & 
X86_CR4_DE) ) + return 0; + + for ( i = 0; i < 4; i++ ) + { + if ( !(v->arch.debugreg[5] & + (3 << (i * DR_ENABLE_SIZE))) ) + continue; + + start = v->arch.debugreg[i]; + width = 0; + + switch ( (v->arch.debugreg[7] >> + (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc ) + { + case DR_LEN_1: width = 1; break; + case DR_LEN_2: width = 2; break; + case DR_LEN_4: width = 4; break; + case DR_LEN_8: width = 8; break; + } + + if ( (start < (port + len)) && ((start + width) > port) ) + match |= 1 << i; + } + + return match; +} + +static int priv_op_read_io(unsigned int port, unsigned int bytes, + unsigned long *val, struct x86_emulate_ctxt *ctxt) +{ + struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); + struct vcpu *curr = current; + struct domain *currd = current->domain; + + /* INS must not come here. */ + ASSERT((ctxt->opcode & ~9) == 0xe4); + + if ( !guest_io_okay(port, bytes, curr, ctxt->regs) ) + return X86EMUL_UNHANDLEABLE; + + poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes); + + if ( admin_io_okay(port, bytes, currd) ) + { + io_emul_stub_t *io_emul = + io_emul_stub_setup(poc, ctxt->opcode, port, bytes); + + mark_regs_dirty(ctxt->regs); + io_emul(ctxt->regs); + return X86EMUL_DONE; + } + + *val = guest_io_read(port, bytes, currd); + + return X86EMUL_OKAY; +} + +void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data, + struct domain *currd) +{ + if ( admin_io_okay(port, bytes, currd) ) + { + switch ( bytes ) { + case 1: + outb((uint8_t)data, port); + if ( pv_post_outb_hook ) + pv_post_outb_hook(port, (uint8_t)data); + break; + case 2: + outw((uint16_t)data, port); + break; + case 4: + outl(data, port); + break; + } + return; + } + + while ( bytes != 0 ) + { + unsigned int size = 1; + + if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) + { + pv_pit_handler(port, (uint8_t)data, 1); + } + else if ( port == RTC_PORT(0) ) + { + currd->arch.cmos_idx = data; + } + else if ( (port == RTC_PORT(1)) && + ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) ) + { + unsigned long flags; + + if ( pv_rtc_handler ) + pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data); + spin_lock_irqsave(&rtc_lock, flags); + outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0)); + outb(data, RTC_PORT(1)); + spin_unlock_irqrestore(&rtc_lock, flags); + } + else if ( (port == 0xcf8) && (bytes == 4) ) + { + size = 4; + currd->arch.pci_cf8 = data; + } + else if ( (port & 0xfffc) == 0xcfc ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; + if ( pci_cfg_ok(currd, port & 3, size, &data) ) + pci_conf_write(currd->arch.pci_cf8, port & 3, size, data); + } + + if ( size == 4 ) + return; + + port += size; + bytes -= size; + data >>= size * 8; + } +} + +static int priv_op_write_io(unsigned int port, unsigned int bytes, + unsigned long val, struct x86_emulate_ctxt *ctxt) +{ + struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); + struct vcpu *curr = current; + struct domain *currd = current->domain; + + /* OUTS must not come here. 
*/ + ASSERT((ctxt->opcode & ~9) == 0xe6); + + if ( !guest_io_okay(port, bytes, curr, ctxt->regs) ) + return X86EMUL_UNHANDLEABLE; + + poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes); + + if ( admin_io_okay(port, bytes, currd) ) + { + io_emul_stub_t *io_emul = + io_emul_stub_setup(poc, ctxt->opcode, port, bytes); + + mark_regs_dirty(ctxt->regs); + io_emul(ctxt->regs); + if ( (bytes == 1) && pv_post_outb_hook ) + pv_post_outb_hook(port, val); + return X86EMUL_DONE; + } + + guest_io_write(port, bytes, val, currd); + + return X86EMUL_OKAY; +} + +static int priv_op_read_segment(enum x86_segment seg, + struct segment_register *reg, + struct x86_emulate_ctxt *ctxt) +{ + /* Check if this is an attempt to access the I/O bitmap. */ + if ( seg == x86_seg_tr ) + { + switch ( ctxt->opcode ) + { + case 0x6c ... 0x6f: /* ins / outs */ + case 0xe4 ... 0xe7: /* in / out (immediate port) */ + case 0xec ... 0xef: /* in / out (port in %dx) */ + /* Defer the check to priv_op_{read,write}_io(). */ + return X86EMUL_DONE; + } + } + + if ( ctxt->addr_size < 64 ) + { + unsigned long limit; + unsigned int sel, ar; + + switch ( seg ) + { + case x86_seg_cs: sel = ctxt->regs->cs; break; + case x86_seg_ds: sel = read_sreg(ds); break; + case x86_seg_es: sel = read_sreg(es); break; + case x86_seg_fs: sel = read_sreg(fs); break; + case x86_seg_gs: sel = read_sreg(gs); break; + case x86_seg_ss: sel = ctxt->regs->ss; break; + default: return X86EMUL_UNHANDLEABLE; + } + + if ( !read_descriptor(sel, current, ®->base, &limit, &ar, 0) ) + return X86EMUL_UNHANDLEABLE; + + reg->limit = limit; + reg->attr.bytes = ar >> 8; + } + else + { + switch ( seg ) + { + default: + if ( !is_x86_user_segment(seg) ) + return X86EMUL_UNHANDLEABLE; + reg->base = 0; + break; + case x86_seg_fs: + reg->base = rdfsbase(); + break; + case x86_seg_gs: + reg->base = rdgsbase(); + break; + } + + reg->limit = ~0U; + + reg->attr.bytes = 0; + reg->attr.fields.type = _SEGMENT_WR >> 8; + if ( seg == x86_seg_cs ) + { + reg->attr.fields.type |= _SEGMENT_CODE >> 8; + reg->attr.fields.l = 1; + } + else + reg->attr.fields.db = 1; + reg->attr.fields.s = 1; + reg->attr.fields.dpl = 3; + reg->attr.fields.p = 1; + reg->attr.fields.g = 1; + } + + /* + * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero. + * Also do this for consistency for non-conforming code segments. + */ + if ( (seg == x86_seg_ss || + (seg == x86_seg_cs && + !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) && + guest_kernel_mode(current, ctxt->regs) ) + reg->attr.fields.dpl = 0; + + return X86EMUL_OKAY; +} + +static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset, + unsigned int bytes, unsigned long limit, + enum x86_segment seg, + struct x86_emulate_ctxt *ctxt, + unsigned long *addr) +{ + int rc = X86EMUL_OKAY; + + *addr = base + offset; + + if ( ctxt->addr_size < 64 ) + { + if ( limit < bytes - 1 || offset > limit - bytes + 1 ) + rc = X86EMUL_EXCEPTION; + *addr = (uint32_t)*addr; + } + else if ( !__addr_ok(*addr) ) + rc = X86EMUL_EXCEPTION; + + if ( unlikely(rc == X86EMUL_EXCEPTION) ) + x86_emul_hw_exception(seg != x86_seg_ss ? 
TRAP_gp_fault + : TRAP_stack_error, + 0, ctxt); + + return rc; +} + +static int priv_op_rep_ins(uint16_t port, + enum x86_segment seg, unsigned long offset, + unsigned int bytes_per_rep, unsigned long *reps, + struct x86_emulate_ctxt *ctxt) +{ + struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); + struct vcpu *curr = current; + struct domain *currd = current->domain; + unsigned long goal = *reps; + struct segment_register sreg; + int rc; + + ASSERT(seg == x86_seg_es); + + *reps = 0; + + if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) ) + return X86EMUL_UNHANDLEABLE; + + rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt); + if ( rc != X86EMUL_OKAY ) + return rc; + + if ( !sreg.attr.fields.p ) + return X86EMUL_UNHANDLEABLE; + if ( !sreg.attr.fields.s || + (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) || + !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) ) + { + x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt); + return X86EMUL_EXCEPTION; + } + + poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep); + + while ( *reps < goal ) + { + unsigned int data = guest_io_read(port, bytes_per_rep, currd); + unsigned long addr; + + rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep, + sreg.limit, x86_seg_es, ctxt, &addr); + if ( rc != X86EMUL_OKAY ) + return rc; + + if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 ) + { + x86_emul_pagefault(PFEC_write_access, + addr + bytes_per_rep - rc, ctxt); + return X86EMUL_EXCEPTION; + } + + ++*reps; + + if ( poc->bpmatch || hypercall_preempt_check() ) + break; + + /* x86_emulate() clips the repetition count to ensure we don't wrap. */ + if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) ) + offset -= bytes_per_rep; + else + offset += bytes_per_rep; + } + + return X86EMUL_OKAY; +} + +static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset, + uint16_t port, + unsigned int bytes_per_rep, unsigned long *reps, + struct x86_emulate_ctxt *ctxt) +{ + struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); + struct vcpu *curr = current; + struct domain *currd = current->domain; + unsigned long goal = *reps; + struct segment_register sreg; + int rc; + + *reps = 0; + + if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) ) + return X86EMUL_UNHANDLEABLE; + + rc = priv_op_read_segment(seg, &sreg, ctxt); + if ( rc != X86EMUL_OKAY ) + return rc; + + if ( !sreg.attr.fields.p ) + return X86EMUL_UNHANDLEABLE; + if ( !sreg.attr.fields.s || + ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) && + !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) ) + { + x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault + : TRAP_stack_error, + 0, ctxt); + return X86EMUL_EXCEPTION; + } + + poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep); + + while ( *reps < goal ) + { + unsigned int data = 0; + unsigned long addr; + + rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep, + sreg.limit, seg, ctxt, &addr); + if ( rc != X86EMUL_OKAY ) + return rc; + + if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 ) + { + x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt); + return X86EMUL_EXCEPTION; + } + + guest_io_write(port, bytes_per_rep, data, currd); + + ++*reps; + + if ( poc->bpmatch || hypercall_preempt_check() ) + break; + + /* x86_emulate() clips the repetition count to ensure we don't wrap. 
*/ + if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) ) + offset -= bytes_per_rep; + else + offset += bytes_per_rep; + } + + return X86EMUL_OKAY; +} + +static int priv_op_read_cr(unsigned int reg, unsigned long *val, + struct x86_emulate_ctxt *ctxt) +{ + const struct vcpu *curr = current; + + switch ( reg ) + { + case 0: /* Read CR0 */ + *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0]; + return X86EMUL_OKAY; + + case 2: /* Read CR2 */ + case 4: /* Read CR4 */ + *val = curr->arch.pv_vcpu.ctrlreg[reg]; + return X86EMUL_OKAY; + + case 3: /* Read CR3 */ + { + const struct domain *currd = curr->domain; + unsigned long mfn; + + if ( !is_pv_32bit_domain(currd) ) + { + mfn = pagetable_get_pfn(curr->arch.guest_table); + *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn)); + } + else + { + l4_pgentry_t *pl4e = + map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table))); + + mfn = l4e_get_pfn(*pl4e); + unmap_domain_page(pl4e); + *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn)); + } + /* PTs should not be shared */ + BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow); + return X86EMUL_OKAY; + } + } + + return X86EMUL_UNHANDLEABLE; +} + +static int priv_op_write_cr(unsigned int reg, unsigned long val, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *curr = current; + + switch ( reg ) + { + case 0: /* Write CR0 */ + if ( (val ^ read_cr0()) & ~X86_CR0_TS ) + { + gdprintk(XENLOG_WARNING, + "Attempt to change unmodifiable CR0 flags\n"); + break; + } + do_fpu_taskswitch(!!(val & X86_CR0_TS)); + return X86EMUL_OKAY; + + case 2: /* Write CR2 */ + curr->arch.pv_vcpu.ctrlreg[2] = val; + arch_set_cr2(curr, val); + return X86EMUL_OKAY; + + case 3: /* Write CR3 */ + { + struct domain *currd = curr->domain; + unsigned long gfn; + struct page_info *page; + int rc; + + gfn = !is_pv_32bit_domain(currd) + ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val); + page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC); + if ( !page ) + break; + rc = new_guest_cr3(page_to_mfn(page)); + put_page(page); + + switch ( rc ) + { + case 0: + return X86EMUL_OKAY; + case -ERESTART: /* retry after preemption */ + return X86EMUL_RETRY; + } + break; + } + + case 4: /* Write CR4 */ + curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val); + write_cr4(pv_guest_cr4_to_real_cr4(curr)); + ctxt_switch_levelling(curr); + return X86EMUL_OKAY; + } + + return X86EMUL_UNHANDLEABLE; +} + +static int priv_op_read_dr(unsigned int reg, unsigned long *val, + struct x86_emulate_ctxt *ctxt) +{ + unsigned long res = do_get_debugreg(reg); + + if ( IS_ERR_VALUE(res) ) + return X86EMUL_UNHANDLEABLE; + + *val = res; + + return X86EMUL_OKAY; +} + +static int priv_op_write_dr(unsigned int reg, unsigned long val, + struct x86_emulate_ctxt *ctxt) +{ + return do_set_debugreg(reg, val) == 0 + ? 
X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; +} + +static inline uint64_t guest_misc_enable(uint64_t val) +{ + val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL | + MSR_IA32_MISC_ENABLE_MONITOR_ENABLE); + val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | + MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | + MSR_IA32_MISC_ENABLE_XTPR_DISABLE; + return val; +} + +static inline bool is_cpufreq_controller(const struct domain *d) +{ + return ((cpufreq_controller == FREQCTL_dom0_kernel) && + is_hardware_domain(d)); +} + +static int priv_op_read_msr(unsigned int reg, uint64_t *val, + struct x86_emulate_ctxt *ctxt) +{ + struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); + const struct vcpu *curr = current; + const struct domain *currd = curr->domain; + bool vpmu_msr = false; + + switch ( reg ) + { + int rc; + + case MSR_FS_BASE: + if ( is_pv_32bit_domain(currd) ) + break; + *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base; + return X86EMUL_OKAY; + + case MSR_GS_BASE: + if ( is_pv_32bit_domain(currd) ) + break; + *val = cpu_has_fsgsbase ? __rdgsbase() + : curr->arch.pv_vcpu.gs_base_kernel; + return X86EMUL_OKAY; + + case MSR_SHADOW_GS_BASE: + if ( is_pv_32bit_domain(currd) ) + break; + *val = curr->arch.pv_vcpu.gs_base_user; + return X86EMUL_OKAY; + + /* + * In order to fully retain original behavior, defer calling + * pv_soft_rdtsc() until after emulation. This may want/need to be + * reconsidered. + */ + case MSR_IA32_TSC: + poc->tsc |= TSC_BASE; + goto normal; + + case MSR_TSC_AUX: + poc->tsc |= TSC_AUX; + if ( cpu_has_rdtscp ) + goto normal; + *val = 0; + return X86EMUL_OKAY; + + case MSR_EFER: + *val = read_efer(); + if ( is_pv_32bit_domain(currd) ) + *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE); + return X86EMUL_OKAY; + + case MSR_K7_FID_VID_CTL: + case MSR_K7_FID_VID_STATUS: + case MSR_K8_PSTATE_LIMIT: + case MSR_K8_PSTATE_CTRL: + case MSR_K8_PSTATE_STATUS: + case MSR_K8_PSTATE0: + case MSR_K8_PSTATE1: + case MSR_K8_PSTATE2: + case MSR_K8_PSTATE3: + case MSR_K8_PSTATE4: + case MSR_K8_PSTATE5: + case MSR_K8_PSTATE6: + case MSR_K8_PSTATE7: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) + break; + if ( unlikely(is_cpufreq_controller(currd)) ) + goto normal; + *val = 0; + return X86EMUL_OKAY; + + case MSR_IA32_UCODE_REV: + BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL); + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) + { + if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) ) + break; + /* As documented in the SDM: Do a CPUID 1 here */ + cpuid_eax(1); + } + goto normal; + + case MSR_IA32_MISC_ENABLE: + if ( rdmsr_safe(reg, *val) ) + break; + *val = guest_misc_enable(*val); + return X86EMUL_OKAY; + + case MSR_AMD64_DR0_ADDRESS_MASK: + if ( !boot_cpu_has(X86_FEATURE_DBEXT) ) + break; + *val = curr->arch.pv_vcpu.dr_mask[0]; + return X86EMUL_OKAY; + + case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK: + if ( !boot_cpu_has(X86_FEATURE_DBEXT) ) + break; + *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1]; + return X86EMUL_OKAY; + + case MSR_IA32_PERF_CAPABILITIES: + /* No extra capabilities are supported. 
*/ + *val = 0; + return X86EMUL_OKAY; + + case MSR_INTEL_PLATFORM_INFO: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) ) + break; + *val = 0; + if ( this_cpu(cpuid_faulting_enabled) ) + *val |= MSR_PLATFORM_INFO_CPUID_FAULTING; + return X86EMUL_OKAY; + + case MSR_INTEL_MISC_FEATURES_ENABLES: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) ) + break; + *val = 0; + if ( curr->arch.cpuid_faulting ) + *val |= MSR_MISC_FEATURES_CPUID_FAULTING; + return X86EMUL_OKAY; + + case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7): + case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3): + case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2: + case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) + { + vpmu_msr = true; + /* fall through */ + case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5: + case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3: + if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ) + { + if ( vpmu_do_rdmsr(reg, val) ) + break; + return X86EMUL_OKAY; + } + } + /* fall through */ + default: + if ( rdmsr_hypervisor_regs(reg, val) ) + return X86EMUL_OKAY; + + rc = vmce_rdmsr(reg, val); + if ( rc < 0 ) + break; + if ( rc ) + return X86EMUL_OKAY; + /* fall through */ + normal: + /* Everyone can read the MSR space. */ + /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */ + if ( rdmsr_safe(reg, *val) ) + break; + return X86EMUL_OKAY; + } + + return X86EMUL_UNHANDLEABLE; +} + +static int priv_op_write_msr(unsigned int reg, uint64_t val, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *curr = current; + const struct domain *currd = curr->domain; + bool vpmu_msr = false; + + switch ( reg ) + { + uint64_t temp; + int rc; + + case MSR_FS_BASE: + if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) + break; + wrfsbase(val); + curr->arch.pv_vcpu.fs_base = val; + return X86EMUL_OKAY; + + case MSR_GS_BASE: + if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) + break; + wrgsbase(val); + curr->arch.pv_vcpu.gs_base_kernel = val; + return X86EMUL_OKAY; + + case MSR_SHADOW_GS_BASE: + if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) + break; + wrmsrl(MSR_SHADOW_GS_BASE, val); + curr->arch.pv_vcpu.gs_base_user = val; + return X86EMUL_OKAY; + + case MSR_K7_FID_VID_STATUS: + case MSR_K7_FID_VID_CTL: + case MSR_K8_PSTATE_LIMIT: + case MSR_K8_PSTATE_CTRL: + case MSR_K8_PSTATE_STATUS: + case MSR_K8_PSTATE0: + case MSR_K8_PSTATE1: + case MSR_K8_PSTATE2: + case MSR_K8_PSTATE3: + case MSR_K8_PSTATE4: + case MSR_K8_PSTATE5: + case MSR_K8_PSTATE6: + case MSR_K8_PSTATE7: + case MSR_K8_HWCR: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) + break; + if ( likely(!is_cpufreq_controller(currd)) || + wrmsr_safe(reg, val) == 0 ) + return X86EMUL_OKAY; + break; + + case MSR_AMD64_NB_CFG: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || + boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) + break; + if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) + return X86EMUL_OKAY; + if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) || + ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) ) + goto invalid; + if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 ) + return X86EMUL_OKAY; + break; + + case MSR_FAM10H_MMIO_CONF_BASE: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || + boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) + break; + if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) 
+ return X86EMUL_OKAY; + if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 ) + break; + if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ? + temp != val : + ((temp ^ val) & + ~(FAM10H_MMIO_CONF_ENABLE | + (FAM10H_MMIO_CONF_BUSRANGE_MASK << + FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | + ((u64)FAM10H_MMIO_CONF_BASE_MASK << + FAM10H_MMIO_CONF_BASE_SHIFT))) ) + goto invalid; + if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 ) + return X86EMUL_OKAY; + break; + + case MSR_IA32_UCODE_REV: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) + break; + if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) + return X86EMUL_OKAY; + if ( rdmsr_safe(reg, temp) ) + break; + if ( val ) + goto invalid; + return X86EMUL_OKAY; + + case MSR_IA32_MISC_ENABLE: + if ( rdmsr_safe(reg, temp) ) + break; + if ( val != guest_misc_enable(temp) ) + goto invalid; + return X86EMUL_OKAY; + + case MSR_IA32_MPERF: + case MSR_IA32_APERF: + if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) && + (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ) + break; + if ( likely(!is_cpufreq_controller(currd)) || + wrmsr_safe(reg, val) == 0 ) + return X86EMUL_OKAY; + break; + + case MSR_IA32_PERF_CTL: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) + break; + if ( likely(!is_cpufreq_controller(currd)) || + wrmsr_safe(reg, val) == 0 ) + return X86EMUL_OKAY; + break; + + case MSR_IA32_THERM_CONTROL: + case MSR_IA32_ENERGY_PERF_BIAS: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) + break; + if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) || + wrmsr_safe(reg, val) == 0 ) + return X86EMUL_OKAY; + break; + + case MSR_AMD64_DR0_ADDRESS_MASK: + if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) ) + break; + curr->arch.pv_vcpu.dr_mask[0] = val; + if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK ) + wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val); + return X86EMUL_OKAY; + + case MSR_AMD64_DR1_ADDRESS_MASK ... 
MSR_AMD64_DR3_ADDRESS_MASK: + if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) ) + break; + curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val; + if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK ) + wrmsrl(reg, val); + return X86EMUL_OKAY; + + case MSR_INTEL_PLATFORM_INFO: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) ) + break; + return X86EMUL_OKAY; + + case MSR_INTEL_MISC_FEATURES_ENABLES: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) || + rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) ) + break; + if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) && + !this_cpu(cpuid_faulting_enabled) ) + break; + curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING); + return X86EMUL_OKAY; + + case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7): + case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3): + case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2: + case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) + { + vpmu_msr = true; + case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5: + case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3: + if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ) + { + if ( (vpmu_mode & XENPMU_MODE_ALL) && + !is_hardware_domain(currd) ) + return X86EMUL_OKAY; + + if ( vpmu_do_wrmsr(reg, val, 0) ) + break; + return X86EMUL_OKAY; + } + } + /* fall through */ + default: + if ( wrmsr_hypervisor_regs(reg, val) == 1 ) + return X86EMUL_OKAY; + + rc = vmce_wrmsr(reg, val); + if ( rc < 0 ) + break; + if ( rc ) + return X86EMUL_OKAY; + + if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) ) + invalid: + gdprintk(XENLOG_WARNING, + "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n", + reg, temp, val); + return X86EMUL_OKAY; + } + + return X86EMUL_UNHANDLEABLE; +} + +static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt) +{ + /* Ignore the instruction if unprivileged. */ + if ( !cache_flush_permitted(current->domain) ) + /* + * Non-physdev domain attempted WBINVD; ignore for now since + * newer linux uses this in some start-of-day timing loops. + */ + ; + else + wbinvd(); + + return X86EMUL_OKAY; +} + +int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf, + struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt) +{ + guest_cpuid(current, leaf, subleaf, res); + + return X86EMUL_OKAY; +} + +static int priv_op_validate(const struct x86_emulate_state *state, + struct x86_emulate_ctxt *ctxt) +{ + switch ( ctxt->opcode ) + { + case 0x6c ... 0x6f: /* ins / outs */ + case 0xe4 ... 0xe7: /* in / out (immediate port) */ + case 0xec ... 0xef: /* in / out (port in %dx) */ + case X86EMUL_OPC(0x0f, 0x06): /* clts */ + case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */ + case X86EMUL_OPC(0x0f, 0x20) ... + X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */ + case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */ + case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */ + case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */ + case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */ + return X86EMUL_OKAY; + + case 0xfa: case 0xfb: /* cli / sti */ + if ( !iopl_ok(current, ctxt->regs) ) + break; + /* + * This is just too dangerous to allow, in my opinion. Consider if the + * caller then tries to reenable interrupts using POPF: we can't trap + * that and we'll end up with hard-to-debug lockups. Fast & loose will + * do for us. 
:-) + vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa); + */ + return X86EMUL_DONE; + + case X86EMUL_OPC(0x0f, 0x01): + { + unsigned int modrm_rm, modrm_reg; + + if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 || + (modrm_rm & 7) != 1 ) + break; + switch ( modrm_reg & 7 ) + { + case 2: /* xsetbv */ + case 7: /* rdtscp */ + return X86EMUL_OKAY; + } + break; + } + } + + return X86EMUL_UNHANDLEABLE; +} + +static int priv_op_insn_fetch(enum x86_segment seg, + unsigned long offset, + void *p_data, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + const struct priv_op_ctxt *poc = + container_of(ctxt, struct priv_op_ctxt, ctxt); + unsigned int rc; + unsigned long addr = poc->cs.base + offset; + + ASSERT(seg == x86_seg_cs); + + /* We don't mean to emulate any branches. */ + if ( !bytes ) + return X86EMUL_UNHANDLEABLE; + + rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit, + x86_seg_cs, ctxt, &addr); + if ( rc != X86EMUL_OKAY ) + return rc; + + if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 ) + { + /* + * TODO: This should report PFEC_insn_fetch when goc->insn_fetch && + * cpu_has_nx, but we'd then need a "fetch" variant of + * __copy_from_user() respecting NX, SMEP, and protection keys. + */ + x86_emul_pagefault(0, addr + bytes - rc, ctxt); + return X86EMUL_EXCEPTION; + } + + return X86EMUL_OKAY; +} + + +static const struct x86_emulate_ops priv_op_ops = { + .insn_fetch = priv_op_insn_fetch, + .read = x86emul_unhandleable_rw, + .validate = priv_op_validate, + .read_io = priv_op_read_io, + .write_io = priv_op_write_io, + .rep_ins = priv_op_rep_ins, + .rep_outs = priv_op_rep_outs, + .read_segment = priv_op_read_segment, + .read_cr = priv_op_read_cr, + .write_cr = priv_op_write_cr, + .read_dr = priv_op_read_dr, + .write_dr = priv_op_write_dr, + .read_msr = priv_op_read_msr, + .write_msr = priv_op_write_msr, + .cpuid = pv_emul_cpuid, + .wbinvd = priv_op_wbinvd, +}; + +int emulate_privileged_op(struct cpu_user_regs *regs) +{ + struct vcpu *curr = current; + struct domain *currd = curr->domain; + struct priv_op_ctxt ctxt = { + .ctxt.regs = regs, + .ctxt.vendor = currd->arch.cpuid->x86_vendor, + .ctxt.lma = !is_pv_32bit_domain(currd), + }; + int rc; + unsigned int eflags, ar; + + if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit, + &ar, 1) || + !(ar & _SEGMENT_S) || + !(ar & _SEGMENT_P) || + !(ar & _SEGMENT_CODE) ) + return 0; + + /* Mirror virtualized state into EFLAGS. */ + ASSERT(regs->eflags & X86_EFLAGS_IF); + if ( vcpu_info(curr, evtchn_upcall_mask) ) + regs->eflags &= ~X86_EFLAGS_IF; + else + regs->eflags |= X86_EFLAGS_IF; + ASSERT(!(regs->eflags & X86_EFLAGS_IOPL)); + regs->eflags |= curr->arch.pv_vcpu.iopl; + eflags = regs->eflags; + + ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16; + /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */ + rc = x86_emulate(&ctxt.ctxt, &priv_op_ops); + + if ( ctxt.io_emul_stub ) + unmap_domain_page(ctxt.io_emul_stub); + + /* + * Un-mirror virtualized state from EFLAGS. + * Nothing we allow to be emulated can change anything other than the + * arithmetic bits, and the resume flag. 
+ */ + ASSERT(!((regs->eflags ^ eflags) & + ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK))); + regs->eflags |= X86_EFLAGS_IF; + regs->eflags &= ~X86_EFLAGS_IOPL; + + switch ( rc ) + { + case X86EMUL_OKAY: + if ( ctxt.tsc & TSC_BASE ) + { + if ( ctxt.tsc & TSC_AUX ) + pv_soft_rdtsc(curr, regs, 1); + else if ( currd->arch.vtsc ) + pv_soft_rdtsc(curr, regs, 0); + else + msr_split(regs, rdtsc()); + } + + if ( ctxt.ctxt.retire.singlestep ) + ctxt.bpmatch |= DR_STEP; + if ( ctxt.bpmatch ) + { + curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE; + if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) ) + pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); + } + /* fall through */ + case X86EMUL_RETRY: + return EXCRET_fault_fixed; + + case X86EMUL_EXCEPTION: + pv_inject_event(&ctxt.ctxt.event); + return EXCRET_fault_fixed; + } + + return 0; +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/x86_64/gpr_switch.S b/xen/arch/x86/pv/gpr_switch.S similarity index 100% rename from xen/arch/x86/x86_64/gpr_switch.S rename to xen/arch/x86/pv/gpr_switch.S diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index cd8ca20398..cd43e9f44c 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -78,6 +78,8 @@ #include <asm/cpuid.h> #include <xsm/xsm.h> +#include <asm/pv/traps.h> + /* * opt_nmi: one of 'ignore', 'dom0', or 'fatal'. * fatal: Xen prints diagnostic message and then hangs. @@ -705,41 +707,6 @@ static void instruction_done(struct cpu_user_regs *regs, unsigned long rip) } } -static unsigned int check_guest_io_breakpoint(struct vcpu *v, - unsigned int port, unsigned int len) -{ - unsigned int width, i, match = 0; - unsigned long start; - - if ( !(v->arch.debugreg[5]) || - !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ) - return 0; - - for ( i = 0; i < 4; i++ ) - { - if ( !(v->arch.debugreg[5] & - (3 << (i * DR_ENABLE_SIZE))) ) - continue; - - start = v->arch.debugreg[i]; - width = 0; - - switch ( (v->arch.debugreg[7] >> - (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc ) - { - case DR_LEN_1: width = 1; break; - case DR_LEN_2: width = 2; break; - case DR_LEN_4: width = 4; break; - case DR_LEN_8: width = 8; break; - } - - if ( (start < (port + len)) && ((start + width) > port) ) - match |= 1 << i; - } - - return match; -} - /* * Called from asm to set up the MCE trapbounce info. * Returns 0 if no callback is set up, else 1. @@ -1733,1327 +1700,6 @@ static int read_gate_descriptor(unsigned int gate_sel, return 1; } -static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset, - unsigned int bytes, unsigned long limit, - enum x86_segment seg, - struct x86_emulate_ctxt *ctxt, - unsigned long *addr) -{ - int rc = X86EMUL_OKAY; - - *addr = base + offset; - - if ( ctxt->addr_size < 64 ) - { - if ( limit < bytes - 1 || offset > limit - bytes + 1 ) - rc = X86EMUL_EXCEPTION; - *addr = (uint32_t)*addr; - } - else if ( !__addr_ok(*addr) ) - rc = X86EMUL_EXCEPTION; - - if ( unlikely(rc == X86EMUL_EXCEPTION) ) - x86_emul_hw_exception(seg != x86_seg_ss ? 
TRAP_gp_fault - : TRAP_stack_error, - 0, ctxt); - - return rc; -} - -struct priv_op_ctxt { - struct x86_emulate_ctxt ctxt; - struct { - unsigned long base, limit; - } cs; - char *io_emul_stub; - unsigned int bpmatch; - unsigned int tsc; -#define TSC_BASE 1 -#define TSC_AUX 2 -}; - -static int priv_op_insn_fetch(enum x86_segment seg, - unsigned long offset, - void *p_data, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt) -{ - const struct priv_op_ctxt *poc = - container_of(ctxt, struct priv_op_ctxt, ctxt); - unsigned int rc; - unsigned long addr = poc->cs.base + offset; - - ASSERT(seg == x86_seg_cs); - - /* We don't mean to emulate any branches. */ - if ( !bytes ) - return X86EMUL_UNHANDLEABLE; - - rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit, - x86_seg_cs, ctxt, &addr); - if ( rc != X86EMUL_OKAY ) - return rc; - - if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 ) - { - /* - * TODO: This should report PFEC_insn_fetch when goc->insn_fetch && - * cpu_has_nx, but we'd then need a "fetch" variant of - * __copy_from_user() respecting NX, SMEP, and protection keys. - */ - x86_emul_pagefault(0, addr + bytes - rc, ctxt); - return X86EMUL_EXCEPTION; - } - - return X86EMUL_OKAY; -} - -static int priv_op_read_segment(enum x86_segment seg, - struct segment_register *reg, - struct x86_emulate_ctxt *ctxt) -{ - /* Check if this is an attempt to access the I/O bitmap. */ - if ( seg == x86_seg_tr ) - { - switch ( ctxt->opcode ) - { - case 0x6c ... 0x6f: /* ins / outs */ - case 0xe4 ... 0xe7: /* in / out (immediate port) */ - case 0xec ... 0xef: /* in / out (port in %dx) */ - /* Defer the check to priv_op_{read,write}_io(). */ - return X86EMUL_DONE; - } - } - - if ( ctxt->addr_size < 64 ) - { - unsigned long limit; - unsigned int sel, ar; - - switch ( seg ) - { - case x86_seg_cs: sel = ctxt->regs->cs; break; - case x86_seg_ds: sel = read_sreg(ds); break; - case x86_seg_es: sel = read_sreg(es); break; - case x86_seg_fs: sel = read_sreg(fs); break; - case x86_seg_gs: sel = read_sreg(gs); break; - case x86_seg_ss: sel = ctxt->regs->ss; break; - default: return X86EMUL_UNHANDLEABLE; - } - - if ( !read_descriptor(sel, current, ®->base, &limit, &ar, 0) ) - return X86EMUL_UNHANDLEABLE; - - reg->limit = limit; - reg->attr.bytes = ar >> 8; - } - else - { - switch ( seg ) - { - default: - if ( !is_x86_user_segment(seg) ) - return X86EMUL_UNHANDLEABLE; - reg->base = 0; - break; - case x86_seg_fs: - reg->base = rdfsbase(); - break; - case x86_seg_gs: - reg->base = rdgsbase(); - break; - } - - reg->limit = ~0U; - - reg->attr.bytes = 0; - reg->attr.fields.type = _SEGMENT_WR >> 8; - if ( seg == x86_seg_cs ) - { - reg->attr.fields.type |= _SEGMENT_CODE >> 8; - reg->attr.fields.l = 1; - } - else - reg->attr.fields.db = 1; - reg->attr.fields.s = 1; - reg->attr.fields.dpl = 3; - reg->attr.fields.p = 1; - reg->attr.fields.g = 1; - } - - /* - * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero. - * Also do this for consistency for non-conforming code segments. - */ - if ( (seg == x86_seg_ss || - (seg == x86_seg_cs && - !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) && - guest_kernel_mode(current, ctxt->regs) ) - reg->attr.fields.dpl = 0; - - return X86EMUL_OKAY; -} - -/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */ -static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs) -{ - unsigned int cpl = guest_kernel_mode(v, regs) ? - (VM_ASSIST(v->domain, architectural_iopl) ? 
0 : 1) : 3; - - ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0); - - return IOPL(cpl) <= v->arch.pv_vcpu.iopl; -} - -/* Has the guest requested sufficient permission for this I/O access? */ -static int guest_io_okay( - unsigned int port, unsigned int bytes, - struct vcpu *v, struct cpu_user_regs *regs) -{ - /* If in user mode, switch to kernel mode just to read I/O bitmap. */ - int user_mode = !(v->arch.flags & TF_kernel_mode); -#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) - - if ( iopl_ok(v, regs) ) - return 1; - - if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) ) - { - union { uint8_t bytes[2]; uint16_t mask; } x; - - /* - * Grab permission bytes from guest space. Inaccessible bytes are - * read as 0xff (no access allowed). - */ - TOGGLE_MODE(); - switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp, - port>>3, 2) ) - { - default: x.bytes[0] = ~0; - /* fallthrough */ - case 1: x.bytes[1] = ~0; - /* fallthrough */ - case 0: break; - } - TOGGLE_MODE(); - - if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 ) - return 1; - } - - return 0; -} - -/* Has the administrator granted sufficient permission for this I/O access? */ -static bool_t admin_io_okay(unsigned int port, unsigned int bytes, - const struct domain *d) -{ - /* - * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses. - * We never permit direct access to that register. - */ - if ( (port == 0xcf8) && (bytes == 4) ) - return 0; - - /* We also never permit direct access to the RTC/CMOS registers. */ - if ( ((port & ~1) == RTC_PORT(0)) ) - return 0; - - return ioports_access_permitted(d, port, port + bytes - 1); -} - -static bool_t pci_cfg_ok(struct domain *currd, unsigned int start, - unsigned int size, uint32_t *write) -{ - uint32_t machine_bdf; - - if ( !is_hardware_domain(currd) ) - return 0; - - if ( !CF8_ENABLED(currd->arch.pci_cf8) ) - return 1; - - machine_bdf = CF8_BDF(currd->arch.pci_cf8); - if ( write ) - { - const unsigned long *ro_map = pci_get_ro_map(0); - - if ( ro_map && test_bit(machine_bdf, ro_map) ) - return 0; - } - start |= CF8_ADDR_LO(currd->arch.pci_cf8); - /* AMD extended configuration space access? */ - if ( CF8_ADDR_HI(currd->arch.pci_cf8) && - boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 ) - { - uint64_t msr_val; - - if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) ) - return 0; - if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) ) - start |= CF8_ADDR_HI(currd->arch.pci_cf8); - } - - return !write ? 
- xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf, - start, start + size - 1, 0) == 0 : - pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0; -} - -uint32_t guest_io_read(unsigned int port, unsigned int bytes, - struct domain *currd) -{ - uint32_t data = 0; - unsigned int shift = 0; - - if ( admin_io_okay(port, bytes, currd) ) - { - switch ( bytes ) - { - case 1: return inb(port); - case 2: return inw(port); - case 4: return inl(port); - } - } - - while ( bytes != 0 ) - { - unsigned int size = 1; - uint32_t sub_data = ~0; - - if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) - { - sub_data = pv_pit_handler(port, 0, 0); - } - else if ( port == RTC_PORT(0) ) - { - sub_data = currd->arch.cmos_idx; - } - else if ( (port == RTC_PORT(1)) && - ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) ) - { - unsigned long flags; - - spin_lock_irqsave(&rtc_lock, flags); - outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0)); - sub_data = inb(RTC_PORT(1)); - spin_unlock_irqrestore(&rtc_lock, flags); - } - else if ( (port == 0xcf8) && (bytes == 4) ) - { - size = 4; - sub_data = currd->arch.pci_cf8; - } - else if ( (port & 0xfffc) == 0xcfc ) - { - size = min(bytes, 4 - (port & 3)); - if ( size == 3 ) - size = 2; - if ( pci_cfg_ok(currd, port & 3, size, NULL) ) - sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size); - } - - if ( size == 4 ) - return sub_data; - - data |= (sub_data & ((1u << (size * 8)) - 1)) << shift; - shift += size * 8; - port += size; - bytes -= size; - } - - return data; -} - -void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data, - struct domain *currd) -{ - if ( admin_io_okay(port, bytes, currd) ) - { - switch ( bytes ) { - case 1: - outb((uint8_t)data, port); - if ( pv_post_outb_hook ) - pv_post_outb_hook(port, (uint8_t)data); - break; - case 2: - outw((uint16_t)data, port); - break; - case 4: - outl(data, port); - break; - } - return; - } - - while ( bytes != 0 ) - { - unsigned int size = 1; - - if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) - { - pv_pit_handler(port, (uint8_t)data, 1); - } - else if ( port == RTC_PORT(0) ) - { - currd->arch.cmos_idx = data; - } - else if ( (port == RTC_PORT(1)) && - ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) ) - { - unsigned long flags; - - if ( pv_rtc_handler ) - pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data); - spin_lock_irqsave(&rtc_lock, flags); - outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0)); - outb(data, RTC_PORT(1)); - spin_unlock_irqrestore(&rtc_lock, flags); - } - else if ( (port == 0xcf8) && (bytes == 4) ) - { - size = 4; - currd->arch.pci_cf8 = data; - } - else if ( (port & 0xfffc) == 0xcfc ) - { - size = min(bytes, 4 - (port & 3)); - if ( size == 3 ) - size = 2; - if ( pci_cfg_ok(currd, port & 3, size, &data) ) - pci_conf_write(currd->arch.pci_cf8, port & 3, size, data); - } - - if ( size == 4 ) - return; - - port += size; - bytes -= size; - data >>= size * 8; - } -} - -/* I/O emulation support. 
Helper routines for, and type of, the stack stub.*/ -void host_to_guest_gpr_switch(struct cpu_user_regs *); -unsigned long guest_to_host_gpr_switch(unsigned long); - -void (*pv_post_outb_hook)(unsigned int port, u8 value); - -typedef void io_emul_stub_t(struct cpu_user_regs *); - -static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode, - unsigned int port, unsigned int bytes) -{ - if ( !ctxt->io_emul_stub ) - ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) + - (this_cpu(stubs.addr) & - ~PAGE_MASK) + - STUB_BUF_SIZE / 2; - - /* movq $host_to_guest_gpr_switch,%rcx */ - ctxt->io_emul_stub[0] = 0x48; - ctxt->io_emul_stub[1] = 0xb9; - *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch; - /* callq *%rcx */ - ctxt->io_emul_stub[10] = 0xff; - ctxt->io_emul_stub[11] = 0xd1; - /* data16 or nop */ - ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66; - /* <io-access opcode> */ - ctxt->io_emul_stub[13] = opcode; - /* imm8 or nop */ - ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90; - /* ret (jumps to guest_to_host_gpr_switch) */ - ctxt->io_emul_stub[15] = 0xc3; - BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16); - - if ( ioemul_handle_quirk ) - ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs); - - /* Handy function-typed pointer to the stub. */ - return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2); -} - -static int priv_op_read_io(unsigned int port, unsigned int bytes, - unsigned long *val, struct x86_emulate_ctxt *ctxt) -{ - struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); - struct vcpu *curr = current; - struct domain *currd = current->domain; - - /* INS must not come here. */ - ASSERT((ctxt->opcode & ~9) == 0xe4); - - if ( !guest_io_okay(port, bytes, curr, ctxt->regs) ) - return X86EMUL_UNHANDLEABLE; - - poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes); - - if ( admin_io_okay(port, bytes, currd) ) - { - io_emul_stub_t *io_emul = - io_emul_stub_setup(poc, ctxt->opcode, port, bytes); - - mark_regs_dirty(ctxt->regs); - io_emul(ctxt->regs); - return X86EMUL_DONE; - } - - *val = guest_io_read(port, bytes, currd); - - return X86EMUL_OKAY; -} - -static int priv_op_write_io(unsigned int port, unsigned int bytes, - unsigned long val, struct x86_emulate_ctxt *ctxt) -{ - struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); - struct vcpu *curr = current; - struct domain *currd = current->domain; - - /* OUTS must not come here. 
*/ - ASSERT((ctxt->opcode & ~9) == 0xe6); - - if ( !guest_io_okay(port, bytes, curr, ctxt->regs) ) - return X86EMUL_UNHANDLEABLE; - - poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes); - - if ( admin_io_okay(port, bytes, currd) ) - { - io_emul_stub_t *io_emul = - io_emul_stub_setup(poc, ctxt->opcode, port, bytes); - - mark_regs_dirty(ctxt->regs); - io_emul(ctxt->regs); - if ( (bytes == 1) && pv_post_outb_hook ) - pv_post_outb_hook(port, val); - return X86EMUL_DONE; - } - - guest_io_write(port, bytes, val, currd); - - return X86EMUL_OKAY; -} - -static int priv_op_rep_ins(uint16_t port, - enum x86_segment seg, unsigned long offset, - unsigned int bytes_per_rep, unsigned long *reps, - struct x86_emulate_ctxt *ctxt) -{ - struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); - struct vcpu *curr = current; - struct domain *currd = current->domain; - unsigned long goal = *reps; - struct segment_register sreg; - int rc; - - ASSERT(seg == x86_seg_es); - - *reps = 0; - - if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) ) - return X86EMUL_UNHANDLEABLE; - - rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt); - if ( rc != X86EMUL_OKAY ) - return rc; - - if ( !sreg.attr.fields.p ) - return X86EMUL_UNHANDLEABLE; - if ( !sreg.attr.fields.s || - (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) || - !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) ) - { - x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt); - return X86EMUL_EXCEPTION; - } - - poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep); - - while ( *reps < goal ) - { - unsigned int data = guest_io_read(port, bytes_per_rep, currd); - unsigned long addr; - - rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep, - sreg.limit, x86_seg_es, ctxt, &addr); - if ( rc != X86EMUL_OKAY ) - return rc; - - if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 ) - { - x86_emul_pagefault(PFEC_write_access, - addr + bytes_per_rep - rc, ctxt); - return X86EMUL_EXCEPTION; - } - - ++*reps; - - if ( poc->bpmatch || hypercall_preempt_check() ) - break; - - /* x86_emulate() clips the repetition count to ensure we don't wrap. */ - if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) ) - offset -= bytes_per_rep; - else - offset += bytes_per_rep; - } - - return X86EMUL_OKAY; -} - -static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset, - uint16_t port, - unsigned int bytes_per_rep, unsigned long *reps, - struct x86_emulate_ctxt *ctxt) -{ - struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); - struct vcpu *curr = current; - struct domain *currd = current->domain; - unsigned long goal = *reps; - struct segment_register sreg; - int rc; - - *reps = 0; - - if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) ) - return X86EMUL_UNHANDLEABLE; - - rc = priv_op_read_segment(seg, &sreg, ctxt); - if ( rc != X86EMUL_OKAY ) - return rc; - - if ( !sreg.attr.fields.p ) - return X86EMUL_UNHANDLEABLE; - if ( !sreg.attr.fields.s || - ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) && - !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) ) - { - x86_emul_hw_exception(seg != x86_seg_ss ? 
TRAP_gp_fault - : TRAP_stack_error, - 0, ctxt); - return X86EMUL_EXCEPTION; - } - - poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep); - - while ( *reps < goal ) - { - unsigned int data = 0; - unsigned long addr; - - rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep, - sreg.limit, seg, ctxt, &addr); - if ( rc != X86EMUL_OKAY ) - return rc; - - if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 ) - { - x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt); - return X86EMUL_EXCEPTION; - } - - guest_io_write(port, bytes_per_rep, data, currd); - - ++*reps; - - if ( poc->bpmatch || hypercall_preempt_check() ) - break; - - /* x86_emulate() clips the repetition count to ensure we don't wrap. */ - if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) ) - offset -= bytes_per_rep; - else - offset += bytes_per_rep; - } - - return X86EMUL_OKAY; -} - -static int priv_op_read_cr(unsigned int reg, unsigned long *val, - struct x86_emulate_ctxt *ctxt) -{ - const struct vcpu *curr = current; - - switch ( reg ) - { - case 0: /* Read CR0 */ - *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0]; - return X86EMUL_OKAY; - - case 2: /* Read CR2 */ - case 4: /* Read CR4 */ - *val = curr->arch.pv_vcpu.ctrlreg[reg]; - return X86EMUL_OKAY; - - case 3: /* Read CR3 */ - { - const struct domain *currd = curr->domain; - unsigned long mfn; - - if ( !is_pv_32bit_domain(currd) ) - { - mfn = pagetable_get_pfn(curr->arch.guest_table); - *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn)); - } - else - { - l4_pgentry_t *pl4e = - map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table))); - - mfn = l4e_get_pfn(*pl4e); - unmap_domain_page(pl4e); - *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn)); - } - /* PTs should not be shared */ - BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow); - return X86EMUL_OKAY; - } - } - - return X86EMUL_UNHANDLEABLE; -} - -static int priv_op_write_cr(unsigned int reg, unsigned long val, - struct x86_emulate_ctxt *ctxt) -{ - struct vcpu *curr = current; - - switch ( reg ) - { - case 0: /* Write CR0 */ - if ( (val ^ read_cr0()) & ~X86_CR0_TS ) - { - gdprintk(XENLOG_WARNING, - "Attempt to change unmodifiable CR0 flags\n"); - break; - } - do_fpu_taskswitch(!!(val & X86_CR0_TS)); - return X86EMUL_OKAY; - - case 2: /* Write CR2 */ - curr->arch.pv_vcpu.ctrlreg[2] = val; - arch_set_cr2(curr, val); - return X86EMUL_OKAY; - - case 3: /* Write CR3 */ - { - struct domain *currd = curr->domain; - unsigned long gfn; - struct page_info *page; - int rc; - - gfn = !is_pv_32bit_domain(currd) - ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val); - page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC); - if ( !page ) - break; - rc = new_guest_cr3(page_to_mfn(page)); - put_page(page); - - switch ( rc ) - { - case 0: - return X86EMUL_OKAY; - case -ERESTART: /* retry after preemption */ - return X86EMUL_RETRY; - } - break; - } - - case 4: /* Write CR4 */ - curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val); - write_cr4(pv_guest_cr4_to_real_cr4(curr)); - ctxt_switch_levelling(curr); - return X86EMUL_OKAY; - } - - return X86EMUL_UNHANDLEABLE; -} - -static int priv_op_read_dr(unsigned int reg, unsigned long *val, - struct x86_emulate_ctxt *ctxt) -{ - unsigned long res = do_get_debugreg(reg); - - if ( IS_ERR_VALUE(res) ) - return X86EMUL_UNHANDLEABLE; - - *val = res; - - return X86EMUL_OKAY; -} - -static int priv_op_write_dr(unsigned int reg, unsigned long val, - struct x86_emulate_ctxt *ctxt) -{ - return do_set_debugreg(reg, val) == 0 - ? 
X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; -} - -static inline uint64_t guest_misc_enable(uint64_t val) -{ - val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL | - MSR_IA32_MISC_ENABLE_MONITOR_ENABLE); - val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | - MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | - MSR_IA32_MISC_ENABLE_XTPR_DISABLE; - return val; -} - -static inline bool is_cpufreq_controller(const struct domain *d) -{ - return ((cpufreq_controller == FREQCTL_dom0_kernel) && - is_hardware_domain(d)); -} - -static int priv_op_read_msr(unsigned int reg, uint64_t *val, - struct x86_emulate_ctxt *ctxt) -{ - struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); - const struct vcpu *curr = current; - const struct domain *currd = curr->domain; - bool vpmu_msr = false; - - switch ( reg ) - { - int rc; - - case MSR_FS_BASE: - if ( is_pv_32bit_domain(currd) ) - break; - *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base; - return X86EMUL_OKAY; - - case MSR_GS_BASE: - if ( is_pv_32bit_domain(currd) ) - break; - *val = cpu_has_fsgsbase ? __rdgsbase() - : curr->arch.pv_vcpu.gs_base_kernel; - return X86EMUL_OKAY; - - case MSR_SHADOW_GS_BASE: - if ( is_pv_32bit_domain(currd) ) - break; - *val = curr->arch.pv_vcpu.gs_base_user; - return X86EMUL_OKAY; - - /* - * In order to fully retain original behavior, defer calling - * pv_soft_rdtsc() until after emulation. This may want/need to be - * reconsidered. - */ - case MSR_IA32_TSC: - poc->tsc |= TSC_BASE; - goto normal; - - case MSR_TSC_AUX: - poc->tsc |= TSC_AUX; - if ( cpu_has_rdtscp ) - goto normal; - *val = 0; - return X86EMUL_OKAY; - - case MSR_EFER: - *val = read_efer(); - if ( is_pv_32bit_domain(currd) ) - *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE); - return X86EMUL_OKAY; - - case MSR_K7_FID_VID_CTL: - case MSR_K7_FID_VID_STATUS: - case MSR_K8_PSTATE_LIMIT: - case MSR_K8_PSTATE_CTRL: - case MSR_K8_PSTATE_STATUS: - case MSR_K8_PSTATE0: - case MSR_K8_PSTATE1: - case MSR_K8_PSTATE2: - case MSR_K8_PSTATE3: - case MSR_K8_PSTATE4: - case MSR_K8_PSTATE5: - case MSR_K8_PSTATE6: - case MSR_K8_PSTATE7: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) - break; - if ( unlikely(is_cpufreq_controller(currd)) ) - goto normal; - *val = 0; - return X86EMUL_OKAY; - - case MSR_IA32_UCODE_REV: - BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL); - if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) - { - if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) ) - break; - /* As documented in the SDM: Do a CPUID 1 here */ - cpuid_eax(1); - } - goto normal; - - case MSR_IA32_MISC_ENABLE: - if ( rdmsr_safe(reg, *val) ) - break; - *val = guest_misc_enable(*val); - return X86EMUL_OKAY; - - case MSR_AMD64_DR0_ADDRESS_MASK: - if ( !boot_cpu_has(X86_FEATURE_DBEXT) ) - break; - *val = curr->arch.pv_vcpu.dr_mask[0]; - return X86EMUL_OKAY; - - case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK: - if ( !boot_cpu_has(X86_FEATURE_DBEXT) ) - break; - *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1]; - return X86EMUL_OKAY; - - case MSR_IA32_PERF_CAPABILITIES: - /* No extra capabilities are supported. 
*/ - *val = 0; - return X86EMUL_OKAY; - - case MSR_INTEL_PLATFORM_INFO: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) ) - break; - *val = 0; - if ( this_cpu(cpuid_faulting_enabled) ) - *val |= MSR_PLATFORM_INFO_CPUID_FAULTING; - return X86EMUL_OKAY; - - case MSR_INTEL_MISC_FEATURES_ENABLES: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) ) - break; - *val = 0; - if ( curr->arch.cpuid_faulting ) - *val |= MSR_MISC_FEATURES_CPUID_FAULTING; - return X86EMUL_OKAY; - - case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7): - case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3): - case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2: - case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) - { - vpmu_msr = true; - /* fall through */ - case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5: - case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3: - if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ) - { - if ( vpmu_do_rdmsr(reg, val) ) - break; - return X86EMUL_OKAY; - } - } - /* fall through */ - default: - if ( rdmsr_hypervisor_regs(reg, val) ) - return X86EMUL_OKAY; - - rc = vmce_rdmsr(reg, val); - if ( rc < 0 ) - break; - if ( rc ) - return X86EMUL_OKAY; - /* fall through */ - normal: - /* Everyone can read the MSR space. */ - /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */ - if ( rdmsr_safe(reg, *val) ) - break; - return X86EMUL_OKAY; - } - - return X86EMUL_UNHANDLEABLE; -} - -#include "x86_64/mmconfig.h" - -static int priv_op_write_msr(unsigned int reg, uint64_t val, - struct x86_emulate_ctxt *ctxt) -{ - struct vcpu *curr = current; - const struct domain *currd = curr->domain; - bool vpmu_msr = false; - - switch ( reg ) - { - uint64_t temp; - int rc; - - case MSR_FS_BASE: - if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) - break; - wrfsbase(val); - curr->arch.pv_vcpu.fs_base = val; - return X86EMUL_OKAY; - - case MSR_GS_BASE: - if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) - break; - wrgsbase(val); - curr->arch.pv_vcpu.gs_base_kernel = val; - return X86EMUL_OKAY; - - case MSR_SHADOW_GS_BASE: - if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) - break; - wrmsrl(MSR_SHADOW_GS_BASE, val); - curr->arch.pv_vcpu.gs_base_user = val; - return X86EMUL_OKAY; - - case MSR_K7_FID_VID_STATUS: - case MSR_K7_FID_VID_CTL: - case MSR_K8_PSTATE_LIMIT: - case MSR_K8_PSTATE_CTRL: - case MSR_K8_PSTATE_STATUS: - case MSR_K8_PSTATE0: - case MSR_K8_PSTATE1: - case MSR_K8_PSTATE2: - case MSR_K8_PSTATE3: - case MSR_K8_PSTATE4: - case MSR_K8_PSTATE5: - case MSR_K8_PSTATE6: - case MSR_K8_PSTATE7: - case MSR_K8_HWCR: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) - break; - if ( likely(!is_cpufreq_controller(currd)) || - wrmsr_safe(reg, val) == 0 ) - return X86EMUL_OKAY; - break; - - case MSR_AMD64_NB_CFG: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || - boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) - break; - if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) - return X86EMUL_OKAY; - if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) || - ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) ) - goto invalid; - if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 ) - return X86EMUL_OKAY; - break; - - case MSR_FAM10H_MMIO_CONF_BASE: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || - boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) - break; - if ( 
!is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) - return X86EMUL_OKAY; - if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 ) - break; - if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ? - temp != val : - ((temp ^ val) & - ~(FAM10H_MMIO_CONF_ENABLE | - (FAM10H_MMIO_CONF_BUSRANGE_MASK << - FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | - ((u64)FAM10H_MMIO_CONF_BASE_MASK << - FAM10H_MMIO_CONF_BASE_SHIFT))) ) - goto invalid; - if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 ) - return X86EMUL_OKAY; - break; - - case MSR_IA32_UCODE_REV: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) - break; - if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) - return X86EMUL_OKAY; - if ( rdmsr_safe(reg, temp) ) - break; - if ( val ) - goto invalid; - return X86EMUL_OKAY; - - case MSR_IA32_MISC_ENABLE: - if ( rdmsr_safe(reg, temp) ) - break; - if ( val != guest_misc_enable(temp) ) - goto invalid; - return X86EMUL_OKAY; - - case MSR_IA32_MPERF: - case MSR_IA32_APERF: - if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) && - (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ) - break; - if ( likely(!is_cpufreq_controller(currd)) || - wrmsr_safe(reg, val) == 0 ) - return X86EMUL_OKAY; - break; - - case MSR_IA32_PERF_CTL: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) - break; - if ( likely(!is_cpufreq_controller(currd)) || - wrmsr_safe(reg, val) == 0 ) - return X86EMUL_OKAY; - break; - - case MSR_IA32_THERM_CONTROL: - case MSR_IA32_ENERGY_PERF_BIAS: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) - break; - if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) || - wrmsr_safe(reg, val) == 0 ) - return X86EMUL_OKAY; - break; - - case MSR_AMD64_DR0_ADDRESS_MASK: - if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) ) - break; - curr->arch.pv_vcpu.dr_mask[0] = val; - if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK ) - wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val); - return X86EMUL_OKAY; - - case MSR_AMD64_DR1_ADDRESS_MASK ... 
MSR_AMD64_DR3_ADDRESS_MASK: - if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) ) - break; - curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val; - if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK ) - wrmsrl(reg, val); - return X86EMUL_OKAY; - - case MSR_INTEL_PLATFORM_INFO: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) ) - break; - return X86EMUL_OKAY; - - case MSR_INTEL_MISC_FEATURES_ENABLES: - if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) || - rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) ) - break; - if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) && - !this_cpu(cpuid_faulting_enabled) ) - break; - curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING); - return X86EMUL_OKAY; - - case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7): - case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3): - case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2: - case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) - { - vpmu_msr = true; - case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5: - case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3: - if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ) - { - if ( (vpmu_mode & XENPMU_MODE_ALL) && - !is_hardware_domain(currd) ) - return X86EMUL_OKAY; - - if ( vpmu_do_wrmsr(reg, val, 0) ) - break; - return X86EMUL_OKAY; - } - } - /* fall through */ - default: - if ( wrmsr_hypervisor_regs(reg, val) == 1 ) - return X86EMUL_OKAY; - - rc = vmce_wrmsr(reg, val); - if ( rc < 0 ) - break; - if ( rc ) - return X86EMUL_OKAY; - - if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) ) - invalid: - gdprintk(XENLOG_WARNING, - "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n", - reg, temp, val); - return X86EMUL_OKAY; - } - - return X86EMUL_UNHANDLEABLE; -} - -static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt) -{ - /* Ignore the instruction if unprivileged. */ - if ( !cache_flush_permitted(current->domain) ) - /* - * Non-physdev domain attempted WBINVD; ignore for now since - * newer linux uses this in some start-of-day timing loops. - */ - ; - else - wbinvd(); - - return X86EMUL_OKAY; -} - -int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf, - struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt) -{ - guest_cpuid(current, leaf, subleaf, res); - - return X86EMUL_OKAY; -} - -static int priv_op_validate(const struct x86_emulate_state *state, - struct x86_emulate_ctxt *ctxt) -{ - switch ( ctxt->opcode ) - { - case 0x6c ... 0x6f: /* ins / outs */ - case 0xe4 ... 0xe7: /* in / out (immediate port) */ - case 0xec ... 0xef: /* in / out (port in %dx) */ - case X86EMUL_OPC(0x0f, 0x06): /* clts */ - case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */ - case X86EMUL_OPC(0x0f, 0x20) ... - X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */ - case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */ - case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */ - case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */ - case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */ - return X86EMUL_OKAY; - - case 0xfa: case 0xfb: /* cli / sti */ - if ( !iopl_ok(current, ctxt->regs) ) - break; - /* - * This is just too dangerous to allow, in my opinion. Consider if the - * caller then tries to reenable interrupts using POPF: we can't trap - * that and we'll end up with hard-to-debug lockups. Fast & loose will - * do for us. 
:-) - vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa); - */ - return X86EMUL_DONE; - - case X86EMUL_OPC(0x0f, 0x01): - { - unsigned int modrm_rm, modrm_reg; - - if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 || - (modrm_rm & 7) != 1 ) - break; - switch ( modrm_reg & 7 ) - { - case 2: /* xsetbv */ - case 7: /* rdtscp */ - return X86EMUL_OKAY; - } - break; - } - } - - return X86EMUL_UNHANDLEABLE; -} - -static const struct x86_emulate_ops priv_op_ops = { - .insn_fetch = priv_op_insn_fetch, - .read = x86emul_unhandleable_rw, - .validate = priv_op_validate, - .read_io = priv_op_read_io, - .write_io = priv_op_write_io, - .rep_ins = priv_op_rep_ins, - .rep_outs = priv_op_rep_outs, - .read_segment = priv_op_read_segment, - .read_cr = priv_op_read_cr, - .write_cr = priv_op_write_cr, - .read_dr = priv_op_read_dr, - .write_dr = priv_op_write_dr, - .read_msr = priv_op_read_msr, - .write_msr = priv_op_write_msr, - .cpuid = pv_emul_cpuid, - .wbinvd = priv_op_wbinvd, -}; - -static int emulate_privileged_op(struct cpu_user_regs *regs) -{ - struct vcpu *curr = current; - struct domain *currd = curr->domain; - struct priv_op_ctxt ctxt = { - .ctxt.regs = regs, - .ctxt.vendor = currd->arch.cpuid->x86_vendor, - .ctxt.lma = !is_pv_32bit_domain(currd), - }; - int rc; - unsigned int eflags, ar; - - if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit, - &ar, 1) || - !(ar & _SEGMENT_S) || - !(ar & _SEGMENT_P) || - !(ar & _SEGMENT_CODE) ) - return 0; - - /* Mirror virtualized state into EFLAGS. */ - ASSERT(regs->eflags & X86_EFLAGS_IF); - if ( vcpu_info(curr, evtchn_upcall_mask) ) - regs->eflags &= ~X86_EFLAGS_IF; - else - regs->eflags |= X86_EFLAGS_IF; - ASSERT(!(regs->eflags & X86_EFLAGS_IOPL)); - regs->eflags |= curr->arch.pv_vcpu.iopl; - eflags = regs->eflags; - - ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16; - /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */ - rc = x86_emulate(&ctxt.ctxt, &priv_op_ops); - - if ( ctxt.io_emul_stub ) - unmap_domain_page(ctxt.io_emul_stub); - - /* - * Un-mirror virtualized state from EFLAGS. - * Nothing we allow to be emulated can change anything other than the - * arithmetic bits, and the resume flag. 
- */ - ASSERT(!((regs->eflags ^ eflags) & - ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK))); - regs->eflags |= X86_EFLAGS_IF; - regs->eflags &= ~X86_EFLAGS_IOPL; - - switch ( rc ) - { - case X86EMUL_OKAY: - if ( ctxt.tsc & TSC_BASE ) - { - if ( ctxt.tsc & TSC_AUX ) - pv_soft_rdtsc(curr, regs, 1); - else if ( currd->arch.vtsc ) - pv_soft_rdtsc(curr, regs, 0); - else - msr_split(regs, rdtsc()); - } - - if ( ctxt.ctxt.retire.singlestep ) - ctxt.bpmatch |= DR_STEP; - if ( ctxt.bpmatch ) - { - curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE; - if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) ) - pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); - } - /* fall through */ - case X86EMUL_RETRY: - return EXCRET_fault_fixed; - - case X86EMUL_EXCEPTION: - pv_inject_event(&ctxt.ctxt.event); - return EXCRET_fault_fixed; - } - - return 0; -} - static inline int check_stack_limit(unsigned int ar, unsigned int limit, unsigned int esp, unsigned int decr) { diff --git a/xen/arch/x86/x86_64/Makefile b/xen/arch/x86/x86_64/Makefile index d8815e78b0..f336a6ae65 100644 --- a/xen/arch/x86/x86_64/Makefile +++ b/xen/arch/x86/x86_64/Makefile @@ -1,7 +1,6 @@ subdir-y += compat obj-bin-y += entry.o -obj-bin-y += gpr_switch.o obj-y += traps.o obj-$(CONFIG_KEXEC) += machine_kexec.o obj-y += pci.o diff --git a/xen/include/asm-x86/pv/traps.h b/xen/include/asm-x86/pv/traps.h new file mode 100644 index 0000000000..32c7bac587 --- /dev/null +++ b/xen/include/asm-x86/pv/traps.h @@ -0,0 +1,48 @@ +/* + * pv/traps.h + * + * PV guest traps interface definitions + * + * Copyright (C) 2017 Wei Liu <wei.liu2@xxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms and conditions of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef __X86_PV_TRAPS_H__ +#define __X86_PV_TRAPS_H__ + +#ifdef CONFIG_PV + +#include <public/xen.h> + +int emulate_privileged_op(struct cpu_user_regs *regs); + +#else /* !CONFIG_PV */ + +#include <xen/errno.h> + +int emulate_privileged_op(struct cpu_user_regs *regs) { return -EOPNOTSUPP; } + +#endif /* CONFIG_PV */ + +#endif /* __X86_PV_TRAPS_H__ */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ -- 2.11.0 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx https://lists.xen.org/xen-devel