|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Xen-devel] [PATCH v3 29/34] x86emul: support AVX512{F, BW} integer shuffle insns
Also include shuff{32x4,64x2} as being very similar to shufi{32x4,64x2}.
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v3: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -200,6 +200,7 @@ static const struct test avx512f_all[] =
INSN(prolv, 66, 0f38, 15, vl, dq, vl),
INSNX(pror, 66, 0f, 72, 0, vl, dq, vl),
INSN(prorv, 66, 0f38, 14, vl, dq, vl),
+ INSN(pshufd, 66, 0f, 70, vl, d, vl),
INSN(pslld, 66, 0f, f2, el_4, d, vl),
INSNX(pslld, 66, 0f, 72, 6, vl, d, vl),
INSN(psllq, 66, 0f, f3, el_2, q, vl),
@@ -250,6 +251,10 @@ static const struct test avx512f_no128[]
INSN(extracti32x4, 66, 0f3a, 39, el_4, d, vl),
INSN(insertf32x4, 66, 0f3a, 18, el_4, d, vl),
INSN(inserti32x4, 66, 0f3a, 38, el_4, d, vl),
+ INSN(shuff32x4, 66, 0f3a, 23, vl, d, vl),
+ INSN(shuff64x2, 66, 0f3a, 23, vl, q, vl),
+ INSN(shufi32x4, 66, 0f3a, 43, vl, d, vl),
+ INSN(shufi64x2, 66, 0f3a, 43, vl, q, vl),
};
static const struct test avx512f_512[] = {
@@ -304,6 +309,9 @@ static const struct test avx512bw_all[]
INSN(pmulhw, 66, 0f, e5, vl, w, vl),
INSN(pmullw, 66, 0f, d5, vl, w, vl),
INSN(psadbw, 66, 0f, f6, vl, b, vl),
+ INSN(pshufb, 66, 0f38, 00, vl, b, vl),
+ INSN(pshufhw, f3, 0f, 70, vl, w, vl),
+ INSN(pshuflw, f2, 0f, 70, vl, w, vl),
INSNX(pslldq, 66, 0f, 73, 7, vl, b, vl),
INSN(psllvw, 66, 0f38, 12, vl, w, vl),
INSN(psllw, 66, 0f, f1, el_8, w, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -147,6 +147,10 @@ static inline bool _to_bool(byte_vec_t b
# else
# define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
# define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
+# define swap(x) ({ \
+ vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+ B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
+})
# endif
# elif FLOAT_SIZE == 8
# if VEC_SIZE >= 32
@@ -174,6 +178,10 @@ static inline bool _to_bool(byte_vec_t b
# else
# define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
# define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
+# define swap(x) ({ \
+ vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+ B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
+})
# endif
# endif
# endif
@@ -303,9 +311,14 @@ static inline bool _to_bool(byte_vec_t b
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+# define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
# else
# define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
# define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
+# define swap(x) ((vec_t)B(pshufd, _mask, \
+ B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
+ VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
+ 0b00011011, (vsi_t)undef(), ~0))
# endif
# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
(0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -327,9 +340,14 @@ static inline bool _to_bool(byte_vec_t b
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b01001110, (vsi_t)undef(), ~0))
# else
# define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
# define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
+# define swap(x) ((vec_t)B(pshufd, _mask, \
+ (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
+ VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
+ 0b01001110, (vsi_t)undef(), ~0))
# endif
# define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
# endif
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -119,6 +119,12 @@ typedef long long __attribute__((vector_
#ifdef __AVX512F__
+/* Sadly there are a few exceptions to the general naming rules. */
+#define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
+#define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
+#define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
+#define __builtin_ia32_shuf_i64x2_512_mask __builtin_ia32_shuf_i64x2_mask
+
#if VEC_SIZE < 64
# pragma GCC target ( "avx512vl" )
#endif
@@ -208,6 +214,7 @@ OVR(pmovzxbq);
OVR(pmovzxdq);
OVR(pmovzxwd);
OVR(pmovzxwq);
+OVR(pshufd);
OVR(punpckhdq);
OVR(punpckhqdq);
OVR(punpckldq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -316,7 +316,7 @@ static const struct twobyte_table {
[0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
[0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq },
[0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
- [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
+ [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other, d8s_vl },
[0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl },
[0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
[0x77] = { DstImplicit|SrcNone },
@@ -430,7 +430,8 @@ static const struct ext0f38_table {
uint8_t vsib:1;
disp8scale_t d8s:4;
} ext0f38_table[256] = {
- [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+ [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+ [0x01 ... 0x0b] = { .simd_size = simd_packed_int },
[0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
[0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x13] = { .simd_size = simd_other, .two_op = 1 },
@@ -541,6 +542,7 @@ static const struct ext0f3a_table {
[0x20] = { .simd_size = simd_none, .d8s = 0 },
[0x21] = { .simd_size = simd_other, .d8s = 2 },
[0x22] = { .simd_size = simd_none, .d8s = d8s_dq },
+ [0x23] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
[0x38] = { .simd_size = simd_128, .d8s = 4 },
@@ -550,6 +552,7 @@ static const struct ext0f3a_table {
[0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x40 ... 0x41] = { .simd_size = simd_packed_fp },
[0x42] = { .simd_size = simd_packed_int },
+ [0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x44] = { .simd_size = simd_packed_int },
[0x46] = { .simd_size = simd_packed_int },
[0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -6653,6 +6656,7 @@ x86_emulate(
case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x00): /* vpshufb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
fault_suppression = false;
/* fall through */
case X86EMUL_OPC_EVEX_66(0x0f, 0xd5): /* vpmullw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -6924,6 +6928,20 @@ x86_emulate(
insn_bytes = PFX_BYTES + 3;
break;
+ case X86EMUL_OPC_EVEX_66(0x0f, 0x70): /* vpshufd $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_F3(0x0f, 0x70): /* vpshufhw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_F2(0x0f, 0x70): /* vpshuflw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+ if ( evex.pfx == vex_66 )
+ generate_exception_if(evex.w, EXC_UD);
+ else
+ {
+ host_and_vcpu_must_have(avx512bw);
+ generate_exception_if(evex.br, EXC_UD);
+ }
+ d = (d & ~SrcMask) | SrcMem | TwoOp;
+ op_bytes = 16 << evex.lr;
+ goto avx512f_imm_no_sae;
+
CASE_SIMD_PACKED_INT(0x0f, 0x71): /* Grp12 */
case X86EMUL_OPC_VEX_66(0x0f, 0x71):
CASE_SIMD_PACKED_INT(0x0f, 0x72): /* Grp13 */
@@ -9093,7 +9111,13 @@ x86_emulate(
/* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
if ( evex.w )
host_and_vcpu_must_have(avx512dq);
- generate_exception_if(!evex.lr || evex.br, EXC_UD);
+ generate_exception_if(evex.br, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x23): /* vshuff32x4 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ /* vshuff64x2 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x43): /* vshufi32x4 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ /* vshufi64x2 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ generate_exception_if(!evex.lr, EXC_UD);
fault_suppression = false;
goto avx512f_imm_no_sae;
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |