x86emul: support XOP insns

Signed-off-by: Jan Beulich

--- a/.gitignore
+++ b/.gitignore
@@ -230,6 +230,7 @@
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
 tools/tests/x86_emulator/x86_emulate
+tools/tests/x86_emulator/xop*.[ch]
 tools/tests/xen-access/xen-access
 tools/tests/xenstore/xs-test
 tools/tests/regression/installed/*
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx avx2
+SIMD := sse sse2 sse4 avx avx2 xop
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -44,6 +44,9 @@ avx2-sg-vecs := $(avx2-vecs)
 avx2-sg-idxs := 4 8
 avx2-sg-ints := 4 8
 avx2-sg-flts := 4 8
+xop-vecs := $(avx-vecs)
+xop-ints := 1 2 4 8
+xop-flts := $(avx-flts)
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -98,6 +101,8 @@ $(addsuffix .c,$(SG)):
 
 $(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
 
+xop.o: simd-fma.c
+
 $(TARGET): x86_emulate.o test_x86_emulator.o
 	$(HOSTCC) -o $@ $^
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -471,6 +471,86 @@ static inline bool _to_bool(byte_vec_t b
 #  endif
 # endif
 #endif
+#ifdef __XOP__
+# undef select
+# if VEC_SIZE == 16
+#  if INT_SIZE == 2 || INT_SIZE == 4
+#   include "simd-fma.c"
+#  endif
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if INT_SIZE == 1 || UINT_SIZE == 1
+#   define swap2(x) ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), (vqi_t)inv - 1))
+#  elif INT_SIZE == 2 || UINT_SIZE == 2
+#   define swap2(x) \
+    ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), \
+                                  (vqi_t)(__builtin_ia32_vprotwi(2 * (vhi_t)inv - 1, 8) | \
+                                          (2 * inv - 2))))
+#  elif FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2ps((vec_t){}, x, __builtin_ia32_cvtps2dq(inv) + 3, 0) */ \
+    vec_t t_; \
+    asm ( "vpermil2ps $0, %3, %2, %1, %0" : \
+          "=x" (t_) : \
+          "x" ((vec_t){}), "m" (x), "x" (__builtin_ia32_cvtps2dq(inv) + 3) ); \
+    t_; \
+})
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2pd((vec_t){}, x, */ \
+    /* __builtin_ia32_pmovsxdq128( */ \
+    /*                             __builtin_ia32_cvtpd2dq(inv) + 1) << 1, 0) */ \
+    vdi_t s_ = __builtin_ia32_pmovsxdq128( \
+                   __builtin_ia32_cvtpd2dq(inv) + 1) << 1; \
+    vec_t t_; \
+    asm ( "vpermil2pd $0, %3, %2, %1, %0" : \
+          "=x" (t_) : "x" ((vec_t){}), "x" (x), "m" (s_) ); \
+    t_; \
+})
+#  endif
+#  if INT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphaddbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddbw((vqi_t)(y))))
+#   define hsub(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphsubbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphsubbw((vqi_t)(y))))
+#  elif UINT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packuswb128(__builtin_ia32_vphaddubw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddubw((vqi_t)(y))))
+#  elif INT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphaddwd(x), \
+                                                 __builtin_ia32_vphaddwd(y))
+#   undef hsub
+#   define hsub(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphsubwd(x), \
+                                                 __builtin_ia32_vphsubwd(y))
+#  elif UINT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packusdw128(__builtin_ia32_vphadduwd((vhi_t)(x)), \
+                                                         __builtin_ia32_vphadduwd((vhi_t)(y))))
+#   undef hsub
+#  endif
+# elif VEC_SIZE == 32
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov256((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps256(x)
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd256(x)
+#  endif
+# elif VEC_SIZE == FLOAT_SIZE
+#  if VEC_SIZE == 4
+#   define frac(x) scalar_1op(x, "vfrczss %[in], %[out]")
+#  elif VEC_SIZE == 8
+#   define frac(x) scalar_1op(x, "vfrczsd %[in], %[out]")
+#  endif
+# endif
+#endif
 
 int simd_test(void)
 {
@@ -576,6 +656,29 @@ int simd_test(void)
     if ( !to_bool(y == z) ) return __LINE__;
 # endif
 
+# ifdef frac
+    touch(src);
+    x = frac(src);
+    touch(src);
+    if ( !to_bool(x == 0) ) return __LINE__;
+
+    x = 1 / (src + 1);
+    touch(x);
+    y = frac(x);
+    touch(x);
+    if ( !to_bool(x == y) ) return __LINE__;
+# endif
+
+# if defined(trunc) && defined(frac)
+    x = src / 4;
+    touch(x);
+    y = trunc(x);
+    touch(x);
+    z = frac(x);
+    touch(x);
+    if ( !to_bool(x == y + z) ) return __LINE__;
+# endif
+
 #else
 
 # if ELEM_SIZE > 1
@@ -677,7 +780,7 @@ int simd_test(void)
     y = z << sh;
     if ( !to_bool(x == y + y) ) return __LINE__;
 
-# if defined(__AVX2__) && ELEM_SIZE >= 4
+# if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
     touch(sh);
     x = y >> sh;
     if ( !to_bool(x == z) ) return __LINE__;
@@ -871,6 +974,8 @@ int simd_test(void)
 #endif
 
 #ifdef hadd
+# if (!defined(INT_SIZE) || INT_SIZE > 1 || ELEM_COUNT < 16) && \
+     (!defined(UINT_SIZE) || UINT_SIZE > 1 || ELEM_COUNT <= 16)
     x = src;
    for ( i = ELEM_COUNT; i >>= 1; )
     {
@@ -878,6 +983,7 @@ int simd_test(void)
         x = hadd((vec_t){}, x);
     }
     if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
+# endif
 
 # ifdef hsub
     touch(src);
@@ -889,6 +995,9 @@ int simd_test(void)
 # endif
 #endif
 
+#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+    return -fma_test();
+#endif
     return 0;
 }
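As context for the frac()/trunc() checks above: vfrcz{ps,pd,ss,sd} produce just the fractional portion of each element, so frac() of an integral vector is all zeroes and trunc(x) + frac(x) reproduces x. A minimal scalar model of that semantic, separate from the patch (the frcz() helper name is made up):

#include <math.h>
#include <stdio.h>

/* Scalar model of what VFRCZ computes per element: the fractional
 * part, i.e. x minus x rounded toward zero.  The simd.c additions
 * test frcz(integral) == 0 and trunc(x) + frcz(x) == x. */
static double frcz(double x)
{
    return x - trunc(x);
}

int main(void)
{
    double x = 5.0 / 4;                     /* cf. the "x = src / 4" test */

    printf("%g\n", frcz(5.0));              /* 0 */
    printf("%g\n", trunc(x) + frcz(x));     /* 1.25, i.e. x again */
    return 0;
}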
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -1,6 +1,8 @@
 #include "simd.h"
 
+#ifndef __XOP__
 ENTRY(fma_test);
+#endif
 
 #if VEC_SIZE < 16
 # define to_bool(cmp) (!~(cmp)[0])
+#include "xop.h" #define verbose false /* Switch to true for far more logging. */ @@ -63,6 +64,11 @@ static bool simd_check_avx2(void) } #define simd_check_avx2_sg simd_check_avx2 +static bool simd_check_xop(void) +{ + return cpu_has_xop; +} + static void simd_set_regs(struct cpu_user_regs *regs) { if ( cpu_has_mmx ) @@ -191,6 +197,22 @@ static const struct { SIMD(AVX2 S/G i64[4x32], avx2_sg, 32x4i8), SIMD(AVX2 S/G i32[4x64], avx2_sg, 32x8i4), SIMD(AVX2 S/G i64[4x64], avx2_sg, 32x8i8), + SIMD(XOP 128bit single, xop, 16f4), + SIMD(XOP 256bit single, xop, 32f4), + SIMD(XOP 128bit double, xop, 16f8), + SIMD(XOP 256bit double, xop, 32f8), + SIMD(XOP s8x16, xop, 16i1), + SIMD(XOP u8x16, xop, 16u1), + SIMD(XOP s16x8, xop, 16i2), + SIMD(XOP u16x8, xop, 16u2), + SIMD(XOP s32x4, xop, 16i4), + SIMD(XOP u32x4, xop, 16u4), + SIMD(XOP s64x2, xop, 16i8), + SIMD(XOP u64x2, xop, 16u8), + SIMD(XOP i8x32, xop, 32i1), + SIMD(XOP i16x16, xop, 32i2), + SIMD(XOP i32x8, xop, 32i4), + SIMD(XOP i64x4, xop, 32i8), #undef SIMD_ #undef SIMD }; --- a/tools/tests/x86_emulator/x86_emulate.h +++ b/tools/tests/x86_emulator/x86_emulate.h @@ -172,6 +172,16 @@ static inline uint64_t xgetbv(uint32_t x (res.c & (1U << 6)) != 0; \ }) +#define cpu_has_xop ({ \ + struct cpuid_leaf res; \ + emul_test_cpuid(1, 0, &res, NULL); \ + if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \ + res.c = 0; \ + else \ + emul_test_cpuid(0x80000001, 0, &res, NULL); \ + (res.c & (1U << 11)) != 0; \ +}) + #define cpu_has_fma4 ({ \ struct cpuid_leaf res; \ emul_test_cpuid(1, 0, &res, NULL); \ --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -435,6 +435,7 @@ static const struct { [0x42] = { .simd_size = simd_packed_int }, [0x44] = { .simd_size = simd_packed_int }, [0x46] = { .simd_size = simd_packed_int }, + [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 }, [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 }, [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 }, @@ -463,6 +464,17 @@ static const struct { uint8_t two_op:1; uint8_t four_op:1; } ext8f08_table[256] = { + [0xa2] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0x85 ... 0x87] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0x8e ... 0x8f] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0x95 ... 0x97] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0x9e ... 0x9f] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0xa3] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0xa6] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0xb6] = { .simd_size = simd_packed_int, .four_op = 1 }, + [0xc0 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0xcc ... 0xcf] = { .simd_size = simd_packed_int }, + [0xec ... 0xef] = { .simd_size = simd_packed_int }, }; static const struct { @@ -470,6 +482,16 @@ static const struct { uint8_t two_op:1; } ext8f09_table[256] = { [0x01 ... 0x02] = { .two_op = 1 }, + [0x80 ... 0x81] = { .simd_size = simd_packed_fp, .two_op = 1 }, + [0x82 ... 0x83] = { .simd_size = simd_scalar_fp, .two_op = 1 }, + [0x90 ... 0x9b] = { .simd_size = simd_packed_int }, + [0xc1 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0xc6 ... 0xc7] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0xcb] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0xd1 ... 0xd3] = { .simd_size = simd_packed_int, .two_op = 1 }, + [0xd6 ... 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -435,6 +435,7 @@ static const struct {
     [0x42] = { .simd_size = simd_packed_int },
     [0x44] = { .simd_size = simd_packed_int },
     [0x46] = { .simd_size = simd_packed_int },
+    [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -463,6 +464,17 @@ static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext8f08_table[256] = {
+    [0x85 ... 0x87] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x8e ... 0x8f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x95 ... 0x97] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x9e ... 0x9f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa2] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa3] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xb6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xc0 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcc ... 0xcf] = { .simd_size = simd_packed_int },
+    [0xec ... 0xef] = { .simd_size = simd_packed_int },
 };
 
 static const struct {
@@ -470,6 +482,16 @@ static const struct {
     uint8_t two_op:1;
 } ext8f09_table[256] = {
     [0x01 ... 0x02] = { .two_op = 1 },
+    [0x80 ... 0x81] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x82 ... 0x83] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x90 ... 0x9b] = { .simd_size = simd_packed_int },
+    [0xc1 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xc6 ... 0xc7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd1 ... 0xd3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd6 ... 0xd7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xe1 ... 0xe3] = { .simd_size = simd_packed_int, .two_op = 1 },
 };
 
 #define REX_PREFIX 0x40
@@ -528,7 +550,7 @@ union vex {
 #define copy_VEX(ptr, vex) ({ \
     if ( !mode_64bit() ) \
         (vex).reg |= 8; \
-    (ptr)[0 - PFX_BYTES] = 0xc4; \
+    (ptr)[0 - PFX_BYTES] = ext < ext_8f08 ? 0xc4 : 0x8f; \
     (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
     (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
     container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
@@ -1653,6 +1675,7 @@ static bool vcpu_has(
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
 #define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
 #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
+#define vcpu_has_xop()         vcpu_has(0x80000001, ECX, 11, ctxt, ops)
 #define vcpu_has_fma4()        vcpu_has(0x80000001, ECX, 16, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
@@ -2985,9 +3008,19 @@ x86_decode(
     case simd_packed_int:
         switch ( vex.pfx )
         {
-        case vex_none: op_bytes = 8; break;
-        case vex_66:   op_bytes = 16 << vex.l; break;
-        default:       op_bytes = 0; break;
+        case vex_none:
+            if ( !vex.opcx )
+            {
+                op_bytes = 8;
+                break;
+            }
+            /* fall through */
+        case vex_66:
+            op_bytes = 16 << vex.l;
+            break;
+        default:
+            op_bytes = 0;
+            break;
         }
         break;
 
@@ -7996,6 +8029,13 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2pd $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_int_imm8;
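For orientation before the opcode cases below: the table and copy_VEX changes reflect that XOP reuses the 3-byte VEX layout verbatim, with escape byte 0x8f in place of 0xc4 and the low five bits of the first payload byte selecting opcode map 8, 9 or 0xa (hence ext_8f08/ext8f09 above and the X86EMUL_OPC_XOP() cases). A sketch mirroring the emulator's own union vex bitfields, assuming GCC-style little-endian bitfield layout; the example bytes are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Layout of the two bytes following the 0x8f escape. */
union xop_prefix {
    uint8_t raw[2];
    struct {
        uint8_t opcx:5;  /* opcode map: 8, 9 or 0xa */
        uint8_t b:1;     /* inverted REX.B */
        uint8_t x:1;     /* inverted REX.X */
        uint8_t r:1;     /* inverted REX.R */
        uint8_t pfx:2;   /* implied SIMD prefix */
        uint8_t l:1;     /* vector length: 0 = 128 bits, 1 = 256 bits */
        uint8_t reg:4;   /* inverted extra register specifier (vvvv) */
        uint8_t w:1;     /* operand order / size select */
    };
};

int main(void)
{
    /* An illustrative map-8 prefix: 8f e8 70 <opcode> ... */
    union xop_prefix xop = { .raw = { 0xe8, 0x70 } };

    printf("map %#x, vvvv register %u, %u-bit\n",
           xop.opcx, (unsigned)(xop.reg ^ 0xf), 128u << xop.l);
    return 0;
}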
@@ -8133,6 +8173,41 @@ x86_emulate(
         asm ( "rorl %b1,%k0" : "=g" (dst.val) : "c" (imm1), "0" (src.val) );
         break;
 
+    case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8e): /* vpmacssdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8f): /* vpmacssdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x95): /* vpmacsww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x96): /* vpmacswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x97): /* vpmacsdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9e): /* vpmacsdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9f): /* vpmacsdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xa6): /* vpmadcsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xb6): /* vpmadcswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc0): /* vprotb $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc1): /* vprotw $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc2): /* vprotd $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc3): /* vprotq $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcc): /* vpcomb $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcd): /* vpcomw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xce): /* vpcomd $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcf): /* vpcomq $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xec): /* vpcomub $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xed): /* vpcomuw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xee): /* vpcomud $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xef): /* vpcomuq $imm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa3): /* vpperm xmm/m128,xmm,xmm,xmm */
+                                    /* vpperm xmm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa2): /* vpcmov {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                    /* vpcmov {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_XOP(09, 0x01): /* XOP Grp1 */
         switch ( modrm_reg & 7 )
         {
@@ -8182,6 +8257,61 @@ x86_emulate(
         }
         goto cannot_emulate;
 
+    case X86EMUL_OPC_XOP(09, 0x82): /* vfrczss xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x83): /* vfrczsd xmm/m128,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x80): /* vfrczps {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_XOP(09, 0x81): /* vfrczpd {x,y}mm/mem,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_ymm;
+
+    case X86EMUL_OPC_XOP(09, 0xc1): /* vphaddbw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc2): /* vphaddbd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc3): /* vphaddbq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc6): /* vphaddwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc7): /* vphaddwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xcb): /* vphadddq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd1): /* vphaddubw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd2): /* vphaddubd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd3): /* vphaddubq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd6): /* vphadduwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd7): /* vphadduwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xdb): /* vphaddudq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe1): /* vphsubbw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe2): /* vphsubwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe3): /* vphsubdq xmm/m128,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x90): /* vprotb xmm/m128,xmm,xmm */
+                                    /* vprotb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x91): /* vprotw xmm/m128,xmm,xmm */
+                                    /* vprotw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x92): /* vprotd xmm/m128,xmm,xmm */
+                                    /* vprotd xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x93): /* vprotq xmm/m128,xmm,xmm */
+                                    /* vprotq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x94): /* vpshlb xmm/m128,xmm,xmm */
+                                    /* vpshlb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x95): /* vpshlw xmm/m128,xmm,xmm */
+                                    /* vpshlw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x96): /* vpshld xmm/m128,xmm,xmm */
+                                    /* vpshld xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x97): /* vpshlq xmm/m128,xmm,xmm */
+                                    /* vpshlq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x98): /* vpshab xmm/m128,xmm,xmm */
+                                    /* vpshab xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x99): /* vpshaw xmm/m128,xmm,xmm */
+                                    /* vpshaw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9a): /* vpshad xmm/m128,xmm,xmm */
+                                    /* vpshad xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9b): /* vpshaq xmm/m128,xmm,xmm */
+                                    /* vpshaq xmm,xmm/m128,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_ymm;
+
     case X86EMUL_OPC_XOP(0a, 0x10): /* bextr imm,r/m,r */
     {
         uint8_t *buf = get_stub(stub);
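The vprot/vpshl/vpsha groups handled above are XOP's per-element variable rotates and shifts: each destination element's count comes from the low byte of the corresponding element of the count operand, with a negative count shifting in the opposite direction, which is also why the generic right-shift test in simd.c can now run under __XOP__. A scalar model of one vpshad element, illustrative only (the helper name is made up and saturation of out-of-range counts is ignored):

#include <stdint.h>
#include <stdio.h>

/* One VPSHAD element: a positive signed count shifts left, a negative
 * one shifts right arithmetically. */
static int32_t vpshad_elem(int32_t x, int8_t count)
{
    return count >= 0 ? (int32_t)((uint32_t)x << count)
                      : x >> -count; /* arithmetic shift on usual ABIs */
}

int main(void)
{
    printf("%d\n", vpshad_elem(-32, 2));   /* -128 */
    printf("%d\n", vpshad_elem(-32, -2));  /* -8 */
    return 0;
}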
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -76,6 +76,7 @@
 #define cpu_has_cmp_legacy      boot_cpu_has(X86_FEATURE_CMP_LEGACY)
 #define cpu_has_svm             boot_cpu_has(X86_FEATURE_SVM)
 #define cpu_has_sse4a           boot_cpu_has(X86_FEATURE_SSE4A)
+#define cpu_has_xop             boot_cpu_has(X86_FEATURE_XOP)
 #define cpu_has_lwp             boot_cpu_has(X86_FEATURE_LWP)
 #define cpu_has_fma4            boot_cpu_has(X86_FEATURE_FMA4)
 #define cpu_has_tbm             boot_cpu_has(X86_FEATURE_TBM)