[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [PATCH v3 12/16] x86emul: support AVX10.2 BFloat16 insns
These are all very similar to various existing insns. VGETEXPPBF16, not living in the expected place, benefits from the respective twobyte_table[] entry already having Mov (aka TwoOp). Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> --- This still follows what spec version 001 says for VGETEXPPBF16. It moved to map 6 (and be NP), yet so far no SDE is available to run the test harness there with the changed encoding. Spec rev 002 says VSCALEFPBF16, yet that's going to change to VSCALEFNEPBF16. --- SDE: ??? --- v3: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -713,16 +713,37 @@ static const struct test vpclmulqdq_all[ }; static const struct test avx10_2_all[] = { + INSN(addnepbf16, 66, map5, 58, vl, bf16, vl), + INSN(cmppbf16, f2, 0f3a, c2, vl, bf16, vl), INSN(comsbf16, 66, map5, 2f, el, bf16, el), INSN(comxsd, f3, 0f, 2f, el, q, el), INSN(comxsh, f2, map5, 2f, el, fp16, el), INSN(comxss, f2, 0f, 2f, el, d, el), + INSN(divnepbf16, 66, map5, 5e, vl, bf16, vl), INSN(dpphps, , 0f38, 52, vl, d, vl), + INSN(fmadd132nepbf16, , map6, 98, vl, bf16, vl), + INSN(fmadd213nepbf16, , map6, a8, vl, bf16, vl), + INSN(fmadd231nepbf16, , map6, b8, vl, bf16, vl), + INSN(fmsub132nepbf16, , map6, 9a, vl, bf16, vl), + INSN(fmsub213nepbf16, , map6, aa, vl, bf16, vl), + INSN(fmsub231nepbf16, , map6, ba, vl, bf16, vl), + INSN(fnmadd132nepbf16, , map6, 9c, vl, bf16, vl), + INSN(fnmadd213nepbf16, , map6, ac, vl, bf16, vl), + INSN(fnmadd231nepbf16, , map6, bc, vl, bf16, vl), + INSN(fnmsub132nepbf16, , map6, 9e, vl, bf16, vl), + INSN(fnmsub213nepbf16, , map6, ae, vl, bf16, vl), + INSN(fnmsub231nepbf16, , map6, be, vl, bf16, vl), + INSN(fpclasspbf16, f2, 0f3a, 66, vl, bf16, vl), + INSN(getexppbf16, 66, map5, 42, vl, bf16, vl), + INSN(getmantpbf16, f2, 0f3a, 26, vl, bf16, vl), + INSN(maxpbf16, 66, map5, 5f, vl, bf16, vl), + INSN(minpbf16, 66, map5, 5d, vl, bf16, vl), INSN(minmax, 66, 0f3a, 52, vl, sd, vl), INSN(minmax, 66, 0f3a, 53, el, sd, el), INSN(minmaxpbf16, f2, 0f3a, 52, vl, bf16, vl), INSN(minmaxph, , 0f3a, 52, vl, fp16, vl), INSN(minmaxsh, , 0f3a, 53, el, fp16, el), + INSN(mulnepbf16, 66, map5, 59, vl, bf16, vl), INSN(mpsadbw, f3, 0f3a, 42, vl, d_nb, vl), INSN(pdpbssd, f2, 0f38, 50, vl, d, vl), INSN(pdpbssds, f2, 0f38, 51, vl, d, vl), @@ -736,6 +757,13 @@ static const struct test avx10_2_all[] = INSN(pdpwusds, 66, 0f38, d3, vl, d, vl), INSN(pdpwuud, , 0f38, d2, vl, d, vl), INSN(pdpwuuds, , 0f38, d3, vl, d, vl), + INSN(rcpph, , map6, 4c, vl, bf16, vl), + INSN(reducenepbf16, f2, 0f3a, 56, vl, bf16, vl), + INSN(rndscalenepbf16, f2, 0f3a, 08, vl, bf16, vl), + INSN(rsqrtph, , map6, 4e, vl, bf16, vl), + INSN(scalefnepbf16, , map6, 2c, vl, bf16, vl), + INSN(sqrtnepbf16, 66, map5, 51, vl, bf16, vl), + INSN(subnepbf16, 66, map5, 5c, vl, bf16, vl), INSN(ucomxsd, f3, 0f, 2e, el, q, el), INSN(ucomxsh, f2, map5, 2e, el, fp16, el), INSN(ucomxss, f2, 0f, 2e, el, d, el), --- a/tools/tests/x86_emulator/predicates.c +++ b/tools/tests/x86_emulator/predicates.c @@ -2054,6 +2054,7 @@ static const struct evex { { { 0x05 }, 3, T, R, pfx_66, W1, Ln }, /* vpermilpd */ { { 0x08 }, 3, T, R, pfx_no, W0, Ln }, /* vrndscaleph */ { { 0x08 }, 3, T, R, pfx_66, W0, Ln }, /* vrndscaleps */ + { { 0x08 }, 3, T, R, pfx_f2, W0, Ln }, /* vrndscalenepbf16 */ { { 0x09 }, 3, T, R, pfx_66, W1, Ln }, /* vrndscalepd */ { { 0x0a }, 3, T, R, pfx_no, W0, LIG }, /* vrndscalesh */ { { 0x0a }, 3, T, R, pfx_66, W0, LIG }, /* vrndscaless */ @@ -2077,6 +2078,7 @@ static const struct evex { { { 0x25 }, 3, T, R, pfx_66, Wn, Ln }, /* vpternlog{d,q} */ { { 0x26 }, 3, T, R, pfx_no, W0, Ln }, /* vgetmantph */ { { 0x26 }, 3, T, R, pfx_66, Wn, Ln }, /* vgetmantp{s,d} */ + { { 0x26 }, 3, T, R, pfx_f2, W0, Ln }, /* vgetmantpbf16 */ { { 0x27 }, 3, T, R, pfx_no, W0, LIG }, /* vgetmantsh */ { { 0x27 }, 3, T, R, pfx_66, Wn, LIG }, /* vgetmants{s,d} */ { { 0x38 }, 3, T, R, pfx_66, Wn, L1|L2 }, /* vinserti{32x4,64x2} */ @@ -2100,10 +2102,12 @@ static const struct evex { { { 0x55 }, 3, T, R, pfx_66, Wn, LIG }, /* vfixumpimms{s,d} */ { { 0x56 }, 3, T, R, pfx_no, W0, Ln }, /* vreduceph */ { { 0x56 }, 3, T, R, pfx_66, Wn, Ln }, /* vreducep{s,d} */ + { { 0x56 }, 3, T, R, pfx_f2, W0, Ln }, /* vreducenepbf16 */ { { 0x57 }, 3, T, R, pfx_no, W0, LIG }, /* vreducesh */ { { 0x57 }, 3, T, R, pfx_66, Wn, LIG }, /* vreduces{s,d} */ { { 0x66 }, 3, T, R, pfx_no, W0, Ln }, /* vfpclassph */ { { 0x66 }, 3, T, R, pfx_66, Wn, Ln }, /* vfpclassp{s,d} */ + { { 0x66 }, 3, T, R, pfx_f2, W0, Ln }, /* vfpclasspbf16 */ { { 0x67 }, 3, T, R, pfx_no, W0, LIG }, /* vfpclasssh */ { { 0x67 }, 3, T, R, pfx_66, Wn, LIG }, /* vfpclasss{s,d} */ { { 0x70 }, 3, T, R, pfx_66, W1, Ln }, /* vshldw */ @@ -2112,6 +2116,7 @@ static const struct evex { { { 0x73 }, 3, T, R, pfx_66, Wn, Ln }, /* vshrd{d,q} */ { { 0xc2 }, 3, T, R, pfx_no, W0, Ln }, /* vcmpph */ { { 0xc2 }, 3, T, R, pfx_f3, W0, LIG }, /* vcmpsh */ + { { 0xc2 }, 3, T, R, pfx_f2, W0, Ln }, /* vcmppbf16 */ { { 0xce }, 3, T, R, pfx_66, W1, Ln }, /* vgf2p8affineqb */ { { 0xcf }, 3, T, R, pfx_66, W1, Ln }, /* vgf2p8affineinvqb */ }, evex_map5[] = { @@ -2127,11 +2132,15 @@ static const struct evex { { { 0x2f }, 2, T, R, pfx_no, W0, LIG }, /* vcomish */ { { 0x2f }, 2, T, R, pfx_66, W0, LIG }, /* vcomsbf16 */ { { 0x2f }, 2, T, R, pfx_f2, W0, LIG }, /* vcomxsh */ + { { 0x42 }, 2, T, R, pfx_66, W0, Ln }, /* vgetexppbf16 */ { { 0x51 }, 2, T, R, pfx_no, W0, Ln }, /* vsqrtph */ + { { 0x51 }, 2, T, R, pfx_66, W0, Ln }, /* vsqrtnepbf16 */ { { 0x51 }, 2, T, R, pfx_f3, W0, LIG }, /* vsqrtsh */ { { 0x58 }, 2, T, R, pfx_no, W0, Ln }, /* vaddph */ + { { 0x58 }, 2, T, R, pfx_66, W0, Ln }, /* vaddnepbf16 */ { { 0x58 }, 2, T, R, pfx_f3, W0, LIG }, /* vaddsh */ { { 0x59 }, 2, T, R, pfx_no, W0, Ln }, /* vmulph */ + { { 0x59 }, 2, T, R, pfx_66, W0, Ln }, /* vmulnepbf16 */ { { 0x59 }, 2, T, R, pfx_f3, W0, LIG }, /* vmulsh */ { { 0x5a }, 2, T, R, pfx_no, W0, Ln }, /* vcvtph2pd */ { { 0x5a }, 2, T, R, pfx_66, W1, Ln }, /* vcvtpd2ph */ @@ -2142,12 +2151,16 @@ static const struct evex { { { 0x5b }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2dq */ { { 0x5b }, 2, T, R, pfx_f3, W0, Ln }, /* vcvttph2dq */ { { 0x5c }, 2, T, R, pfx_no, W0, Ln }, /* vsubph */ + { { 0x5c }, 2, T, R, pfx_66, W0, Ln }, /* vsubnepbf16 */ { { 0x5c }, 2, T, R, pfx_f3, W0, LIG }, /* vsubsh */ { { 0x5d }, 2, T, R, pfx_no, W0, Ln }, /* vminph */ + { { 0x5d }, 2, T, R, pfx_66, W0, Ln }, /* vminpbf16 */ { { 0x5d }, 2, T, R, pfx_f3, W0, LIG }, /* vminsh */ { { 0x5e }, 2, T, R, pfx_no, W0, Ln }, /* vdivph */ + { { 0x5e }, 2, T, R, pfx_66, W0, Ln }, /* vdivnepbf16 */ { { 0x5e }, 2, T, R, pfx_f3, W0, LIG }, /* vdivsh */ { { 0x5f }, 2, T, R, pfx_no, W0, Ln }, /* vmaxph */ + { { 0x5f }, 2, T, R, pfx_66, W0, Ln }, /* vmaxpbf16 */ { { 0x5f }, 2, T, R, pfx_f3, W0, LIG }, /* vmaxsh */ { { 0x6e }, 2, T, R, pfx_66, WIG, L0 }, /* vmovw */ { { 0x6e }, 2, T, R, pfx_f3, W0, L0 }, /* vmovw */ @@ -2173,12 +2186,15 @@ static const struct evex { }, evex_map6[] = { { { 0x13 }, 2, T, R, pfx_66, W0, Ln }, /* vcvtph2psx */ { { 0x13 }, 2, T, R, pfx_no, W0, LIG }, /* vcvtsh2ss */ + { { 0x2c }, 2, T, R, pfx_no, W0, Ln }, /* vscalefnepbf16 */ { { 0x2c }, 2, T, R, pfx_66, W0, Ln }, /* vscalefph */ { { 0x2d }, 2, T, R, pfx_66, W0, LIG }, /* vscalefsh */ { { 0x42 }, 2, T, R, pfx_66, W0, Ln }, /* vgetexpph */ { { 0x43 }, 2, T, R, pfx_66, W0, LIG }, /* vgetexpsh */ + { { 0x4c }, 2, T, R, pfx_no, W0, Ln }, /* vrcppbf16 */ { { 0x4c }, 2, T, R, pfx_66, W0, Ln }, /* vrcpph */ { { 0x4d }, 2, T, R, pfx_66, W0, LIG }, /* vrcpsh */ + { { 0x4e }, 2, T, R, pfx_no, W0, Ln }, /* vrsqrtpbf16 */ { { 0x4e }, 2, T, R, pfx_66, W0, Ln }, /* vrsqrtph */ { { 0x4f }, 2, T, R, pfx_66, W0, LIG }, /* vrsqrtsh */ { { 0x56 }, 2, T, R, pfx_f3, W0, Ln }, /* vfmaddcph */ --- a/xen/arch/x86/x86_emulate/decode.c +++ b/xen/arch/x86/x86_emulate/decode.c @@ -1472,31 +1472,34 @@ int x86emul_decode(struct x86_emulate_st { switch ( b ) { - case 0x08: /* vrndscaleph */ + case 0x08: /* vrndscale{ph,nepbf16} */ + case 0x26: /* vfpclassp{h,bf16} */ + case 0x52: /* vminmaxp{h,bf16} */ + case 0x56: /* vgetmantp{h,bf16} */ + case 0x66: /* vreduce{ph,nepbf16} */ + if ( !s->evex.pfx || s->evex.pfx == vex_f2 ) + s->fp16 = true; + break; + case 0x0a: /* vrndscalesh */ - case 0x26: /* vfpclassph */ case 0x27: /* vfpclasssh */ case 0x53: /* vminmaxsh */ - case 0x56: /* vgetmantph */ case 0x57: /* vgetmantsh */ - case 0x66: /* vreduceph */ case 0x67: /* vreducesh */ if ( !s->evex.pfx ) s->fp16 = true; break; - case 0x52: /* vminmaxp{h,bf16} */ - if ( !s->evex.pfx || s->evex.pfx == vex_f2 ) - s->fp16 = true; - break; - - case 0xc2: /* vpcmp{p,s}h */ - if ( !(s->evex.pfx & VEX_PREFIX_DOUBLE_MASK) ) + case 0xc2: /* vpcmp{p,s}h, vcmppbf16 */ + if ( s->evex.pfx != vex_66 ) s->fp16 = true; break; } - disp8scale = decode_disp8scale(ext0f3a_table[b].d8s, s); + if ( s->fp16 && s->evex.pfx == vex_f2 && !s->evex.brs ) + disp8scale = 4 + s->evex.lr; + else + disp8scale = decode_disp8scale(ext0f3a_table[b].d8s, s); } break; @@ -1504,7 +1507,7 @@ int x86emul_decode(struct x86_emulate_st switch ( b ) { default: - if ( !(s->evex.pfx & VEX_PREFIX_DOUBLE_MASK) ) + if ( s->evex.pfx != vex_f2 ) s->fp16 = true; break; @@ -1534,6 +1537,11 @@ int x86emul_decode(struct x86_emulate_st s->simd_size = simd_none; break; + case 0x5a: /* vcvt{p,s}d2{p,s}h, vcvt{p,s}h2{p,s}d */ + if ( !(s->evex.pfx & VEX_PREFIX_DOUBLE_MASK) ) + s->fp16 = true; + break; + case 0x5b: /* vcvt{d,q}q2ph, vcvt{,t}ph2dq */ if ( s->evex.pfx && s->evex.pfx != vex_f2 ) s->fp16 = true; @@ -1586,6 +1594,14 @@ int x86emul_decode(struct x86_emulate_st disp8scale = 1; break; + case 0x42: /* vgetexppbf16 needs special casing */ + if ( s->evex.pfx == vex_66 ) + { + s->simd_size = simd_packed_fp; + disp8scale = s->evex.brs ? 1 : 4 + s->evex.lr; + } + break; + case 0x5a: /* vcvtph2pd needs special casing */ if ( !s->evex.pfx && !s->evex.brs ) disp8scale -= 2; @@ -1618,7 +1634,7 @@ int x86emul_decode(struct x86_emulate_st switch ( b ) { default: - if ( s->evex.pfx == vex_66 ) + if ( !(s->evex.pfx & VEX_PREFIX_SCALAR_MASK) ) s->fp16 = true; break; @@ -1950,6 +1966,13 @@ int x86emul_decode(struct x86_emulate_st s->op_bytes = 4 >> s->fp16; break; case vex_f2: + if ( s->fp16 ) + { + ASSERT(evex_encoded()); + generate_exception_if(s->evex.w, X86_EXC_UD); + s->op_bytes = 0; + break; + } generate_exception_if(evex_encoded() && !s->evex.w, X86_EXC_UD); s->op_bytes = 8; break; --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -7319,6 +7319,20 @@ x86_emulate( avx512_vlen_check(b & 2); goto simd_imm8_zmm; + case X86EMUL_OPC_EVEX_F2(0x0f3a, 0x66): /* vfpclasspbf16 $imm8,[xyz]mm/mem,k{k} */ + case X86EMUL_OPC_EVEX_F2(0x0f3a, 0xc2): /* vcmppbf16 $imm8,[xyz]mm/mem,[xyz]mm,k{k} */ + generate_exception_if(!evex.r || !evex.R || evex.z, X86_EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_F2(0x0f3a, 0x08): /* vrndscalenepbf16 $imm8,[xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_F2(0x0f3a, 0x26): /* vgetmantpbf16 $imm8,[xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_F2(0x0f3a, 0x56): /* vreducenepbf16 $imm8,[xyz]mm/mem,[xyz]mm{k} */ + generate_exception_if(evex.w || (ea.type != OP_MEM && evex.brs), + X86_EXC_UD); + vcpu_must_have(avx10, 2); + avx512_vlen_check(false); + op_bytes = 16 << evex.lr; + goto simd_imm8_zmm; + #endif /* X86EMUL_NO_SIMD */ CASE_SIMD_PACKED_INT(0x0f3a, 0x0f): /* palignr $imm8,{,x}mm/mem,{,x}mm */ @@ -7951,6 +7965,36 @@ x86_emulate( generate_exception_if(evex.w, X86_EXC_UD); goto avx512f_all_fp; + case X86EMUL_OPC_EVEX_66(5, 0x42): /* vgetexppbf16 [xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(5, 0x51): /* vsqrtnepbf16 [xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(5, 0x58): /* vaddnepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(5, 0x59): /* vmulnepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(5, 0x5c): /* vsubnepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(5, 0x5d): /* vminpbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(5, 0x5e): /* vdivnepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(5, 0x5f): /* vmaxpbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0x2c): /* vscalefnepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0x4c): /* vrcppbf16 [xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0x4e): /* vrsqrtpbf16 [xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0x98): /* vfmadd132nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0x9a): /* vfmsub132nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0x9c): /* vfnmadd132nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0x9e): /* vfnmsub132nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0xa8): /* vfmadd213nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0xaa): /* vfmsub213nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0xac): /* vfnmadd213nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0xae): /* vfnmsub213nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0xb8): /* vfmadd231nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0xba): /* vfmsub231nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0xbc): /* vfnmadd231nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX(6, 0xbe): /* vfnmsub231nepbf16 [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + generate_exception_if(evex.w || (ea.type != OP_MEM && evex.brs), + X86_EXC_UD); + vcpu_must_have(avx10, 2); + avx512_vlen_check(false); + op_bytes = 16 << evex.lr; + goto simd_zmm; + CASE_SIMD_ALL_FP(_EVEX, 5, 0x5a): /* vcvtp{h,d}2p{h,d} [xyz]mm/mem,[xyz]mm{k} */ /* vcvts{h,d}2s{h,d} xmm/mem,xmm,xmm{k} */ visa_check(_fp16);
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |