[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH v7 22/49] x86emul: support AVX512F legacy-equivalent scalar int/FP conversion insns
VCVT{,T}S{S,D}2SI use EVEX.W for their destination (register) rather than their (possibly memory) source operand size and hence need a "manual" override of disp8scale. While the SDM claims that EVEX.L'L needs to be zero for the 32-bit forms of VCVT{,U}SI2SD (exception type E10NF), observations on my test system do not confirm this (and I've got informal confirmation that this is a doc mistake). Nevertheless, to be on the safe side, force evex.lr to be zero in this case though when constructing the stub. Slightly adjust the scalar to_int() in the test harness, to increase the chances of the operand ending up in memory. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- v7: Fix VCVTSI2SS - cannot re-use VMOV{D,Q} code here, as the register form can't be converted to a memory one when embedded rounding is in effect. Force evex.lr to zero for 32-bit VCVTSI2SD. Permit embedded rounding for VCVT{,T}S{S,D}2SI. Re-base. v4: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -117,8 +117,16 @@ static const struct test avx512f_all[] = INSN(cvtps2dq, 66, 0f, 5b, vl, d, vl), INSN(cvtps2pd, , 0f, 5a, vl_2, d, vl), INSN(cvtps2ph, 66, 0f3a, 1d, vl_2, d_nb, vl), + INSN(cvtsd2si, f2, 0f, 2d, el, q, el), INSN(cvtsd2ss, f2, 0f, 5a, el, q, el), + INSN(cvtsi2sd, f2, 0f, 2a, el, dq64, el), + INSN(cvtsi2ss, f3, 0f, 2a, el, dq64, el), INSN(cvtss2sd, f3, 0f, 5a, el, d, el), + INSN(cvtss2si, f3, 0f, 2d, el, d, el), + INSN(cvttpd2dq, 66, 0f, e6, vl, q, vl), + INSN(cvttps2dq, f3, 0f, 5b, vl, d, vl), + INSN(cvttsd2si, f2, 0f, 2c, el, q, el), + INSN(cvttss2si, f3, 0f, 2c, el, d, el), INSN_FP(div, 0f, 5e), INSN(fmadd132, 66, 0f38, 98, vl, sd, vl), INSN(fmadd132, 66, 0f38, 99, el, sd, el), @@ -746,8 +754,9 @@ static void test_group(const struct test break; case ESZ_dq: - test_pair(&tests[i], vl[j], ESZ_d, "d", ESZ_q, "q", - instr, ctxt); + test_pair(&tests[i], vl[j], ESZ_d, + strncmp(tests[i].mnemonic, "cvt", 3) ? 
"d" : "l", + ESZ_q, "q", instr, ctxt); break; #ifdef __i386__ --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -89,7 +89,7 @@ static inline bool _to_bool(byte_vec_t b #endif #if VEC_SIZE == FLOAT_SIZE -# define to_int(x) ((vec_t){ (int)(x)[0] }) +# define to_int(x) ({ int i_ = (x)[0]; touch(i_); ((vec_t){ i_ }); }) #elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__) # define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x)) #elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \ --- a/tools/tests/x86_emulator/simd.h +++ b/tools/tests/x86_emulator/simd.h @@ -340,10 +340,28 @@ OVR(cvtps2dq); OVR(cvtps2pd); OVR(cvtps2ph); OVR(cvtsd2ss); +OVR(cvtsd2si); +OVR(cvtsd2sil); +OVR(cvtsd2siq); +OVR(cvtsi2sd); +OVR(cvtsi2sdl); +OVR(cvtsi2sdq); +OVR(cvtsi2ss); +OVR(cvtsi2ssl); +OVR(cvtsi2ssq); OVR(cvtss2sd); +OVR(cvtss2si); +OVR(cvtss2sil); +OVR(cvtss2siq); OVR(cvttpd2dqx); OVR(cvttpd2dqy); OVR(cvttps2dq); +OVR(cvttsd2si); +OVR(cvttsd2sil); +OVR(cvttsd2siq); +OVR(cvttss2si); +OVR(cvttss2sil); +OVR(cvttss2siq); OVR(movddup); OVR(movntdq); OVR(movntdqa); --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -296,7 +296,7 @@ static const struct twobyte_table { [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM }, [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl }, [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp, d8s_vl }, - [0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other }, + [0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, d8s_dq64 }, [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl }, [0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other }, [0x2e ... 
0x2f] = { ImplicitOps|ModRM|TwoOp, simd_none, d8s_dq }, @@ -3072,6 +3072,12 @@ x86_decode( modrm_mod = 3; break; + case 0x2c: /* vcvtts{s,d}2si need special casing */ + case 0x2d: /* vcvts{s,d}2si need special casing */ + if ( evex_encoded() ) + disp8scale = 2 + (evex.pfx & VEX_PREFIX_DOUBLE_MASK); + break; + case 0x5a: /* vcvtps2pd needs special casing */ if ( disp8scale && !evex.pfx && !evex.brs ) --disp8scale; @@ -6190,6 +6196,48 @@ x86_emulate( state->simd_size = simd_none; goto simd_0f_rm; + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2a): /* vcvtsi2s{s,d} r/m,xmm,xmm */ + generate_exception_if(evex.opmsk || (ea.type != OP_REG && evex.brs), + EXC_UD); + host_and_vcpu_must_have(avx512f); + if ( !evex.brs ) + avx512_vlen_check(true); + get_fpu(X86EMUL_FPU_zmm); + + if ( ea.type == OP_MEM ) + { + rc = read_ulong(ea.mem.seg, ea.mem.off, &src.val, + rex_prefix & REX_W ? 8 : 4, ctxt, ops); + if ( rc != X86EMUL_OKAY ) + goto done; + } + else + src.val = *ea.reg; + + opc = init_evex(stub); + opc[0] = b; + /* Convert memory/GPR source to %rAX. */ + evex.b = 1; + if ( !mode_64bit() ) + evex.w = 0; + /* + * SDM version 067 claims that exception type E10NF implies #UD when + * EVEX.L'L is non-zero for 32-bit VCVT{,U}SI2SD. Experimentally this + * cannot be confirmed, but be on the safe side for the stub. + */ + if ( !evex.w && evex.pfx == vex_f2 ) + evex.lr = 0; + opc[1] = (modrm & 0x38) | 0xc0; + insn_bytes = EVEX_PFX_BYTES + 2; + opc[2] = 0xc3; + + copy_EVEX(opc, evex); + invoke_stub("", "", "=g" (dummy) : "a" (src.val)); + + put_stub(stub); + state->simd_size = simd_none; + break; + CASE_SIMD_SCALAR_FP(, 0x0f, 0x2c): /* cvtts{s,d}2si xmm/mem,reg */ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x2c): /* vcvtts{s,d}2si xmm/mem,reg */ CASE_SIMD_SCALAR_FP(, 0x0f, 0x2d): /* cvts{s,d}2si xmm/mem,reg */ @@ -6213,14 +6261,17 @@ x86_emulate( } opc = init_prefixes(stub); + cvts_2si: opc[0] = b; /* Convert GPR destination to %rAX and memory operand to (%rCX). 
*/ rex_prefix &= ~REX_R; vex.r = 1; + evex.r = 1; if ( ea.type == OP_MEM ) { rex_prefix &= ~REX_B; vex.b = 1; + evex.b = 1; opc[1] = 0x01; rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, @@ -6231,11 +6282,22 @@ x86_emulate( else opc[1] = modrm & 0xc7; if ( !mode_64bit() ) + { vex.w = 0; - insn_bytes = PFX_BYTES + 2; + evex.w = 0; + } + if ( evex_encoded() ) + { + insn_bytes = EVEX_PFX_BYTES + 2; + copy_EVEX(opc, evex); + } + else + { + insn_bytes = PFX_BYTES + 2; + copy_REX_VEX(opc, rex_prefix, vex); + } opc[2] = 0xc3; - copy_REX_VEX(opc, rex_prefix, vex); ea.reg = decode_gpr(&_regs, modrm_reg); invoke_stub("", "", "=a" (*ea.reg) : "c" (mmvalp), "m" (*mmvalp)); @@ -6243,6 +6305,18 @@ x86_emulate( state->simd_size = simd_none; break; + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2c): /* vcvtts{s,d}2si xmm/mem,reg */ + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */ + generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk || + (ea.type != OP_REG && evex.brs)), + EXC_UD); + host_and_vcpu_must_have(avx512f); + if ( !evex.brs ) + avx512_vlen_check(true); + get_fpu(X86EMUL_FPU_zmm); + opc = init_evex(stub); + goto cvts_2si; + CASE_SIMD_PACKED_FP(, 0x0f, 0x2e): /* ucomis{s,d} xmm/mem,xmm */ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2e): /* vucomis{s,d} xmm/mem,xmm */ CASE_SIMD_PACKED_FP(, 0x0f, 0x2f): /* comis{s,d} xmm/mem,xmm */ _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.