Xen project Mailing List

[Xen-devel] [PATCH v5 35/47] x86emul: support AVX512F floating-point conversion insns

To: "xen-devel" <xen-devel@xxxxxxxxxxxxxxxxxxxx>

From: "Jan Beulich" <JBeulich@xxxxxxxx>

Date: Mon, 19 Nov 2018 03:33:54 -0700

Cc: George Dunlap <George.Dunlap@xxxxxxxxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Wei Liu <wei.liu2@xxxxxxxxxx>, Roger Pau Monne <roger.pau@xxxxxxxxxx>

Delivery-date: Mon, 19 Nov 2018 10:34:02 +0000

List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

VCVTPS2PD, sharing its main opcode with others, needs a "manual" override of disp8scale.

The simd_size change for twobyte_table[0x5a] is benign to pre-existing code, but allows decode_disp8scale() to work as is here.

Also correct the comment on an AVX counterpart.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v5: Re-base over changes earlier in the series.
v4: New.

--- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -109,6 +109,12 @@ static const struct test avx512f_all[] = INSN_FP(cmp, 0f, c2), INSN(comisd, 66, 0f, 2f, el, q, el), INSN(comiss, , 0f, 2f, el, d, el), + INSN(cvtpd2ps, 66, 0f, 5a, vl, q, vl), + INSN(cvtph2ps, 66, 0f38, 13, vl_2, d_nb, vl), + INSN(cvtps2pd, , 0f, 5a, vl_2, d, vl), + INSN(cvtps2ph, 66, 0f3a, 1d, vl_2, d_nb, vl), + INSN(cvtsd2ss, f2, 0f, 5a, el, q, el), + INSN(cvtss2sd, f3, 0f, 5a, el, d, el), INSN_FP(div, 0f, 5e), INSN(fmadd132, 66, 0f38, 98, vl, sd, vl), INSN(fmadd132, 66, 0f38, 99, el, sd, el), --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -181,7 +181,9 @@ static inline bool _to_bool(byte_vec_t b # define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0) # define min(x, y) BR_(minps, _mask, x, y, undef(), ~0) # define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE)) +# define shrink1(x) BR_(cvtpd2ps, _mask, (vdf_t)(x), (vsf_half_t){}, ~0) # define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0) +# define widen1(x) ((vec_t)BR(cvtps2pd, _mask, x, (vdf_t)undef(), ~0)) # if VEC_SIZE == 16 # define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0) # define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0) --- a/tools/tests/x86_emulator/simd.h +++ b/tools/tests/x86_emulator/simd.h @@ -68,6 +68,7 @@ typedef short __attribute__((vector_size typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t; #if VEC_SIZE >= 8 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t; +typedef double __attribute__((vector_size(VEC_SIZE))) vdf_t; 
#endif #if ELEM_SIZE == 1 @@ -93,6 +94,7 @@ typedef char __attribute__((vector_size( typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t; typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t; typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t; +typedef float __attribute__((vector_size(HALF_SIZE))) vsf_half_t; # endif # if ELEM_COUNT >= 4 @@ -328,6 +330,13 @@ REN(pandn, , d); REN(por, , d); REN(pxor, , d); # endif +OVR(cvtpd2psx); +OVR(cvtpd2psy); +OVR(cvtph2ps); +OVR(cvtps2pd); +OVR(cvtps2ph); +OVR(cvtsd2ss); +OVR(cvtss2sd); OVR(movddup); OVR(movntdq); OVR(movntdqa); --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -3842,6 +3842,49 @@ int main(int argc, char **argv) else printf("skipped\n"); + printf("%-40s", "Testing vcvtph2ps 32(%ecx),%zmm7{%k4}..."); + if ( stack_exec && cpu_has_avx512f ) + { + decl_insn(evex_vcvtph2ps); + decl_insn(evex_vcvtps2ph); + + asm volatile ( "vpternlogd $0x81, %%zmm7, %%zmm7, %%zmm7\n\t" + "kmovw %1,%%k4\n" + put_insn(evex_vcvtph2ps, "vcvtph2ps 32(%0), %%zmm7%{%%k4%}") + :: "c" (NULL), "r" (0x3333) ); + + set_insn(evex_vcvtph2ps); + memset(res, 0xff, 128); + res[8] = 0x40003c00; /* (1.0, 2.0) */ + res[10] = 0x44004200; /* (3.0, 4.0) */ + res[12] = 0x3400b800; /* (-.5, .25) */ + res[14] = 0xbc000000; /* (0.0, -1.) 
*/ + regs.ecx = (unsigned long)res; + rc = x86_emulate(&ctxt, &emulops); + asm volatile ( "vmovups %%zmm7, %0" : "=m" (res[16]) ); + if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtph2ps) ) + goto fail; + printf("okay\n"); + + printf("%-40s", "Testing vcvtps2ph $0,%zmm3,64(%edx){%k4}..."); + asm volatile ( "vmovups %0, %%zmm3\n" + put_insn(evex_vcvtps2ph, "vcvtps2ph $0, %%zmm3, 128(%1)%{%%k4%}") + :: "m" (res[16]), "d" (NULL) ); + + set_insn(evex_vcvtps2ph); + regs.edx = (unsigned long)res; + memset(res + 32, 0xcc, 32); + rc = x86_emulate(&ctxt, &emulops); + if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtps2ph) ) + goto fail; + res[15] = res[13] = res[11] = res[9] = 0xcccccccc; + if ( memcmp(res + 8, res + 32, 32) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + #undef decl_insn #undef put_insn #undef set_insn --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -310,7 +310,8 @@ static const struct twobyte_table { [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp }, [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl }, [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl }, - [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other }, + [0x5a] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl }, + [0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other }, [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl }, [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl }, [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl }, @@ -437,7 +438,7 @@ static const struct ext0f38_table { [0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, [0x0e ... 0x0f] = { .simd_size = simd_packed_fp }, [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, - [0x13] = { .simd_size = simd_other, .two_op = 1 }, + [0x13] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 }, [0x14 ... 
0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, [0x17] = { .simd_size = simd_packed_int, .two_op = 1 }, [0x18] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 2 }, @@ -541,7 +542,7 @@ static const struct ext0f3a_table { [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 }, [0x1a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 }, [0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 }, - [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 }, + [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 }, [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x20] = { .simd_size = simd_none, .d8s = 0 }, [0x21] = { .simd_size = simd_other, .d8s = 2 }, @@ -3066,6 +3067,11 @@ x86_decode( modrm_mod = 3; break; + case 0x5a: /* vcvtps2pd needs special casing */ + if ( disp8scale && !evex.pfx && !evex.br ) + --disp8scale; + break; + case 0x7e: /* vmovq xmm/m64,xmm needs special casing */ if ( disp8scale == 2 && evex.pfx == vex_f3 ) disp8scale = 3; @@ -5974,6 +5980,7 @@ x86_emulate( CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5d): /* vmin{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + avx512f_all_fp: generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) || (ea.type == OP_MEM && evex.br && (evex.pfx & VEX_PREFIX_SCALAR_MASK))), @@ -6530,7 +6537,7 @@ x86_emulate( goto simd_zmm; CASE_SIMD_ALL_FP(, 0x0f, 0x5a): /* cvt{p,s}{s,d}2{p,s}{s,d} xmm/mem,xmm */ - CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} xmm/mem,xmm */ + CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} {x,y}mm/mem,{x,y}mm */ /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm */ op_bytes = 4 << (((vex.pfx & VEX_PREFIX_SCALAR_MASK) ? 
0 : 1 + vex.l) + !!(vex.pfx & VEX_PREFIX_DOUBLE_MASK)); @@ -6539,6 +6546,12 @@ x86_emulate( goto simd_0f_sse2; goto simd_0f_avx; + CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} [xyz]mm/mem,[xyz]mm{k} */ + /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm{k} */ + op_bytes = 4 << (((evex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + evex.lr) + + evex.w); + goto avx512f_all_fp; + CASE_SIMD_PACKED_FP(, 0x0f, 0x5b): /* cvt{ps,dq}2{dq,ps} xmm/mem,xmm */ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x5b): /* vcvt{ps,dq}2{dq,ps} {x,y}mm/mem,{x,y}mm */ case X86EMUL_OPC_F3(0x0f, 0x5b): /* cvttps2dq xmm/mem,xmm */ @@ -8420,6 +8433,15 @@ x86_emulate( op_bytes = 8 << vex.l; goto simd_0f_ymm; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x13): /* vcvtph2ps {x,y}mm/mem,[xyz]mm{k} */ + generate_exception_if(evex.w || (ea.type == OP_MEM && evex.br), EXC_UD); + host_and_vcpu_must_have(avx512f); + if ( !evex.br ) + avx512_vlen_check(false); + op_bytes = 8 << evex.lr; + elem_bytes = 2; + goto simd_zmm; + case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */ generate_exception_if(!vex.l || vex.w, EXC_UD); @@ -9243,27 +9265,79 @@ x86_emulate( goto avx512f_imm8_no_sae; case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,[xyz]mm,{x,y}mm/mem{k} */ { uint32_t mxcsr; - generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD); - host_and_vcpu_must_have(f16c); fail_if(!ops->write); + if ( evex_encoded() ) + { + generate_exception_if((evex.w || evex.reg != 0xf || !evex.RX || + (ea.type == OP_MEM && (evex.z || evex.br))), + EXC_UD); + host_and_vcpu_must_have(avx512f); + avx512_vlen_check(false); + opc = init_evex(stub); + } + else + { + generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD); + host_and_vcpu_must_have(f16c); + opc = init_prefixes(stub); + } + + op_bytes = 8 << evex.lr; - opc = init_prefixes(stub); opc[0] = b; opc[1] = modrm; if ( ea.type == 
OP_MEM ) { /* Convert memory operand to (%rAX). */ vex.b = 1; + evex.b = 1; opc[1] &= 0x38; } opc[2] = imm1; - insn_bytes = PFX_BYTES + 3; + if ( evex_encoded() ) + { + unsigned int full = 0; + + insn_bytes = EVEX_PFX_BYTES + 3; + copy_EVEX(opc, evex); + + if ( ea.type == OP_MEM && evex.opmsk ) + { + full = 0xffff >> (16 - op_bytes / 2); + op_mask &= full; + if ( !op_mask ) + goto complete_insn; + + first_byte = __builtin_ctz(op_mask); + op_mask >>= first_byte; + full >>= first_byte; + first_byte <<= 1; + op_bytes = (32 - __builtin_clz(op_mask)) << 1; + + /* + * We may need to read (parts of) the memory operand for the + * purpose of merging in order to avoid splitting the write + * below into multiple ones. + */ + if ( op_mask != full && + (rc = ops->read(ea.mem.seg, + truncate_ea(ea.mem.off + first_byte), + (void *)mmvalp + first_byte, op_bytes, + ctxt)) != X86EMUL_OKAY ) + goto done; + } + } + else + { + insn_bytes = PFX_BYTES + 3; + copy_VEX(opc, vex); + } opc[3] = 0xc3; - copy_VEX(opc, vex); /* Latch MXCSR - we may need to restore it below. */ invoke_stub("stmxcsr %[mxcsr]", "", "=m" (*mmvalp), [mxcsr] "=m" (mxcsr) : "a" (mmvalp)); @@ -9272,7 +9346,8 @@ x86_emulate( if ( ea.type == OP_MEM ) { - rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt); + rc = ops->write(ea.mem.seg, truncate_ea(ea.mem.off + first_byte), + (void *)mmvalp + first_byte, op_bytes, ctxt); if ( rc != X86EMUL_OKAY ) { asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) ); _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.