[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v3 29/34] x86emul: support AVX512{F, BW} integer shuffle insns



Also include shuff{32x4,64x2} as being very similar to shufi{32x4,64x2}.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -200,6 +200,7 @@ static const struct test avx512f_all[] =
     INSN(prolv,        66, 0f38, 15,    vl,     dq, vl),
     INSNX(pror,        66,   0f, 72, 0, vl,     dq, vl),
     INSN(prorv,        66, 0f38, 14,    vl,     dq, vl),
+    INSN(pshufd,       66,   0f, 70,    vl,      d, vl),
     INSN(pslld,        66,   0f, f2,    el_4,    d, vl),
     INSNX(pslld,       66,   0f, 72, 6, vl,      d, vl),
     INSN(psllq,        66,   0f, f3,    el_2,    q, vl),
@@ -250,6 +251,10 @@ static const struct test avx512f_no128[]
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
     INSN(insertf32x4,    66, 0f3a, 18, el_4,  d, vl),
     INSN(inserti32x4,    66, 0f3a, 38, el_4,  d, vl),
+    INSN(shuff32x4,      66, 0f3a, 23, vl,    d, vl),
+    INSN(shuff64x2,      66, 0f3a, 23, vl,    q, vl),
+    INSN(shufi32x4,      66, 0f3a, 43, vl,    d, vl),
+    INSN(shufi64x2,      66, 0f3a, 43, vl,    q, vl),
 };
 
 static const struct test avx512f_512[] = {
@@ -304,6 +309,9 @@ static const struct test avx512bw_all[]
     INSN(pmulhw,      66,   0f, e5,    vl,   w, vl),
     INSN(pmullw,      66,   0f, d5,    vl,   w, vl),
     INSN(psadbw,      66,   0f, f6,    vl,   b, vl),
+    INSN(pshufb,      66, 0f38, 00,    vl,   b, vl),
+    INSN(pshufhw,     f3,   0f, 70,    vl,   w, vl),
+    INSN(pshuflw,     f2,   0f, 70,    vl,   w, vl),
     INSNX(pslldq,     66,   0f, 73, 7, vl,   b, vl),
     INSN(psllvw,      66, 0f38, 12,    vl,   w, vl),
     INSN(psllw,       66,   0f, f1,    el_8, w, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -147,6 +147,10 @@ static inline bool _to_bool(byte_vec_t b
 #   else
 #    define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
 #    define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
+#    define swap(x) ({ \
+    vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+    B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
+})
 #   endif
 #  elif FLOAT_SIZE == 8
 #   if VEC_SIZE >= 32
@@ -174,6 +178,10 @@ static inline bool _to_bool(byte_vec_t b
 #   else
 #    define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
 #    define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
+#    define swap(x) ({ \
+    vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+    B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
+})
 #   endif
 #  endif
 # endif
@@ -303,9 +311,14 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
 #  else
 #   define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, \
+                             B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
+                               VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
+                             0b00011011, (vsi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -327,9 +340,14 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b01001110, (vsi_t)undef(), ~0))
 #  else
 #   define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, \
+                             (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
+                                      VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
+                             0b01001110, (vsi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -119,6 +119,12 @@ typedef long long __attribute__((vector_
 
 #ifdef __AVX512F__
 
+/* Sadly there are a few exceptions to the general naming rules. */
+#define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
+#define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
+#define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
+#define __builtin_ia32_shuf_i64x2_512_mask __builtin_ia32_shuf_i64x2_mask
+
 #if VEC_SIZE < 64
 # pragma GCC target ( "avx512vl" )
 #endif
@@ -208,6 +214,7 @@ OVR(pmovzxbq);
 OVR(pmovzxdq);
 OVR(pmovzxwd);
 OVR(pmovzxwq);
+OVR(pshufd);
 OVR(punpckhdq);
 OVR(punpckhqdq);
 OVR(punpckldq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -316,7 +316,7 @@ static const struct twobyte_table {
     [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
-    [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
+    [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other, d8s_vl },
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x77] = { DstImplicit|SrcNone },
@@ -430,7 +430,8 @@ static const struct ext0f38_table {
     uint8_t vsib:1;
     disp8scale_t d8s:4;
 } ext0f38_table[256] = {
-    [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+    [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x01 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
@@ -541,6 +542,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none, .d8s = 0 },
     [0x21] = { .simd_size = simd_other, .d8s = 2 },
     [0x22] = { .simd_size = simd_none, .d8s = d8s_dq },
+    [0x23] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128, .d8s = 4 },
@@ -550,6 +552,7 @@ static const struct ext0f3a_table {
     [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
     [0x42] = { .simd_size = simd_packed_int },
+    [0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x44] = { .simd_size = simd_packed_int },
     [0x46] = { .simd_size = simd_packed_int },
     [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -6653,6 +6656,7 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x00): /* vpshufb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd5): /* vpmullw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -6924,6 +6928,20 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x70): /* vpshufd $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x70): /* vpshufhw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x70): /* vpshuflw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        if ( evex.pfx == vex_66 )
+            generate_exception_if(evex.w, EXC_UD);
+        else
+        {
+            host_and_vcpu_must_have(avx512bw);
+            generate_exception_if(evex.br, EXC_UD);
+        }
+        d = (d & ~SrcMask) | SrcMem | TwoOp;
+        op_bytes = 16 << evex.lr;
+        goto avx512f_imm_no_sae;
+
     CASE_SIMD_PACKED_INT(0x0f, 0x71):    /* Grp12 */
     case X86EMUL_OPC_VEX_66(0x0f, 0x71):
     CASE_SIMD_PACKED_INT(0x0f, 0x72):    /* Grp13 */
@@ -9093,7 +9111,13 @@ x86_emulate(
                                             /* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
         if ( evex.w )
             host_and_vcpu_must_have(avx512dq);
-        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        generate_exception_if(evex.br, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x23): /* vshuff32x4 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+                                            /* vshuff64x2 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x43): /* vshufi32x4 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+                                            /* vshufi64x2 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(!evex.lr, EXC_UD);
         fault_suppression = false;
         goto avx512f_imm_no_sae;
 




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.