Discussion:
[Xen-devel] [PATCH v4 00/17] x86emul: MMX/SSEn support
Jan Beulich
2017-02-28 12:42:27 UTC
This includes support for their AVX counterparts as well as a few
later SSE additions (basically covering the entire 0f-prefixed opcode
space, but not the 0f38 and 0f3a ones, nor 3dnow).

1: support most memory accessing MMX/SSE{,2,3} insns
2: support MMX/SSE{,2,3} moves
3: support MMX/SSE/SSE2 converts
4: support {,V}{,U}COMIS{S,D}
5: support MMX/SSE{,2,4a} insns with only register operands
6: support {,V}{LD,ST}MXCSR
7: support {,V}MOVNTDQA
8: test coverage for SSE/SSE2 insns
9: honor MMXEXT feature flag
10: add tables for 0f38 and 0f3a extension space
11: support SSSE3 insns
12: support SSE4.1 insns
13: support SSE4.2 insns
14: test coverage for SSE3/SSSE3/SSE4* insns

Partly RFC from here on, as testing code is still mostly missing,
although I'm unsure whether it makes sense to cover each and every
individual instruction.

15: support PCLMULQDQ
16: support AESNI insns
17: support SHA insns

Signed-off-by: Jan Beulich <***@suse.com>
---
v4: New patch 14. For fixes to other patches, see the individual patches.
Jan Beulich
2017-02-28 12:49:49 UTC
e 0x0f-escape
space with memory operands. Not covered here are irregular moves,
converts, and {,U}COMIS{S,D} (modifying EFLAGS).

Note that the distinction between the simd_*_fp entries isn't strictly
needed, but I've kept them separate since an earlier version needed
them to be, and we may well find the distinction useful down the road.

Also take the opportunity to adjust the vmovdqu test case the new
LDDQU one here has been cloned from: to zero a ymm register we don't
need to jump through hoops, since 128-bit AVX insns zero the upper
portion of the destination register, and the disabled AVX2 variant of
the code also used a wrong YMM register.
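
A minimal stand-alone check of that zeroing behaviour (illustration
only, not part of the patch; assumes an AVX-capable host and e.g.
"gcc -mavx") could look like:

#include <inttypes.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    struct { uint64_t q[2]; } hi;

    asm volatile ( "vpcmpeqb %%xmm4, %%xmm4, %%xmm4\n\t"        /* xmm4 = all ones */
                   "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n\t" /* set bits 255:128 too */
                   "vpxor %%xmm4, %%xmm4, %%xmm4\n\t"           /* any 128-bit AVX insn */
                   "vextractf128 $1, %%ymm4, %0"                /* read bits 255:128 back */
                   : "=m" (hi) :: "xmm4" );

    printf("upper half: %#" PRIx64 " %#" PRIx64 "\n", hi.q[0], hi.q[1]);
    return hi.q[0] || hi.q[1]; /* 0 iff the 128-bit insn zeroed the upper half */
}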

Signed-off-by: Jan Beulich <***@suse.com>
---
v4: Add blank lines to enum simd_opsize. Re-base.
v3: Correct {,v}addsubp{s,d} comments (no 'h' in mnemonic).
Consistently generate #UD when VEX.l is disallowed. Ignore VEX.l
for scalar insns. Re-base. Introduce more labels to reduce
redundant code. Add fic.exn_raised constraint in invoke_stub() use.
v2: Correct SSE2 p{max,min}{ub,sw} case labels. Correct MMX
ps{ll,r{a,l}} and MMX punpckh{bw,wd,dq} operand sizes. Correct
zapping of TwoOp in x86_decode_twobyte() (and vmovs{s,d} handling
as a result). Also decode pshuf{h,l}w. Correct v{rcp,rsqrt}ss and
vsqrts{s,d} comments (they allow memory operands).

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1665,12 +1665,7 @@ int main(int argc, char **argv)
{
decl_insn(vmovdqu_from_mem);

-#if 0 /* Don't use AVX2 instructions for now */
- asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
-#else
- asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
- "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
-#endif
+ asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
put_insn(vmovdqu_from_mem, "vmovdqu (%0), %%ymm4")
:: "d" (NULL) );

@@ -1684,7 +1679,7 @@ int main(int argc, char **argv)
#if 0 /* Don't use AVX2 instructions for now */
asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
"vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
- "vpmovmskb %%ymm1, %0" : "=r" (rc) );
+ "vpmovmskb %%ymm0, %0" : "=r" (rc) );
#else
asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
"vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
@@ -2092,6 +2087,67 @@ int main(int argc, char **argv)
printf("skipped\n");
#endif

+ printf("%-40s", "Testing lddqu 4(%edx),%xmm4...");
+ if ( stack_exec && cpu_has_sse3 )
+ {
+ decl_insn(lddqu);
+
+ asm volatile ( "pcmpgtb %%xmm4, %%xmm4\n"
+ put_insn(lddqu, "lddqu 4(%0), %%xmm4")
+ :: "d" (NULL) );
+
+ set_insn(lddqu);
+ memset(res, 0x55, 64);
+ memset(res + 1, 0xff, 16);
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(lddqu) )
+ goto fail;
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm4, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vlddqu (%ecx),%ymm4...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vlddqu);
+
+ asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
+ put_insn(vlddqu, "vlddqu (%0), %%ymm4")
+ :: "c" (NULL) );
+
+ set_insn(vlddqu);
+ memset(res + 1, 0xff, 32);
+ regs.ecx = (unsigned long)(res + 1);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vlddqu) )
+ goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+ asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+ "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+ "vpmovmskb %%ymm0, %0" : "=r" (rc) );
+#else
+ asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+ "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+ "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+ "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+ "vpmovmskb %%xmm0, %0\n\t"
+ "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+ rc |= i << 16;
+#endif
+ if ( ~rc )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
#undef decl_insn
#undef put_insn
#undef set_insn
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -80,6 +80,12 @@ static inline uint64_t xgetbv(uint32_t x
(res.d & (1U << 26)) != 0; \
})

+#define cpu_has_sse3 ({ \
+ struct cpuid_leaf res; \
+ emul_test_cpuid(1, 0, &res, NULL); \
+ (res.c & (1U << 0)) != 0; \
+})
+
#define cpu_has_popcnt ({ \
struct cpuid_leaf res; \
emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -45,6 +45,8 @@
#define ModRM (1<<6)
/* Destination is only written; never read. */
#define Mov (1<<7)
+/* VEX/EVEX (SIMD only): 2nd source operand unused (must be all ones) */
+#define TwoOp Mov
/* All operands are implicit in the opcode. */
#define ImplicitOps (DstImplicit|SrcImplicit)

@@ -180,8 +182,49 @@ static const opcode_desc_t opcode_table[
ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
};

+enum simd_opsize {
+ simd_none,
+
+ /*
+ * Ordinary packed integers:
+ * - 64 bits without prefix 66 (MMX)
+ * - 128 bits with prefix 66 (SSEn)
+ * - 128/256 bits depending on VEX.L (AVX)
+ */
+ simd_packed_int,
+
+ /*
+ * Ordinary packed/scalar floating point:
+ * - 128 bits without prefix or with prefix 66 (SSEn)
+ * - 128/256 bits depending on VEX.L (AVX)
+ * - 32 bits with prefix F3 (scalar single)
+ * - 64 bits with prefix F2 (scalar double)
+ */
+ simd_any_fp,
+
+ /*
+ * Packed floating point:
+ * - 128 bits without prefix or with prefix 66 (SSEn)
+ * - 128/256 bits depending on VEX.L (AVX)
+ */
+ simd_packed_fp,
+
+ /*
+ * Single precision packed/scalar floating point:
+ * - 128 bits without prefix (SSEn)
+ * - 128/256 bits depending on VEX.L, no prefix (AVX)
+ * - 32 bits with prefix F3 (scalar)
+ */
+ simd_single_fp,
+
+ /* Operand size encoded in non-standard way. */
+ simd_other
+};
+typedef uint8_t simd_opsize_t;
+
static const struct {
opcode_desc_t desc;
+ simd_opsize_t size;
} twobyte_table[256] = {
[0x00] = { ModRM },
[0x01] = { ImplicitOps|ModRM },
@@ -196,22 +239,41 @@ static const struct {
[0x0d] = { ImplicitOps|ModRM },
[0x0e] = { ImplicitOps },
[0x0f] = { ModRM|SrcImmByte },
- [0x10 ... 0x1f] = { ImplicitOps|ModRM },
+ [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
+ [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+ [0x12 ... 0x13] = { ImplicitOps|ModRM },
+ [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+ [0x16 ... 0x1f] = { ImplicitOps|ModRM },
[0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
[0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
- [0x28 ... 0x2f] = { ImplicitOps|ModRM },
+ [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
+ [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp },
+ [0x2a] = { ImplicitOps|ModRM },
+ [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+ [0x2c ... 0x2f] = { ImplicitOps|ModRM },
[0x30 ... 0x35] = { ImplicitOps },
[0x37] = { ImplicitOps },
[0x38] = { DstReg|SrcMem|ModRM },
[0x3a] = { DstReg|SrcImmByte|ModRM },
[0x40 ... 0x4f] = { DstReg|SrcMem|ModRM|Mov },
- [0x50 ... 0x6e] = { ModRM },
- [0x6f] = { ImplicitOps|ModRM },
- [0x70 ... 0x73] = { SrcImmByte|ModRM },
- [0x74 ... 0x76] = { ModRM },
- [0x77] = { ImplicitOps },
+ [0x50] = { ModRM },
+ [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp },
+ [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
+ [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+ [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+ [0x5a ... 0x5b] = { ModRM },
+ [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+ [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0x6e ... 0x6f] = { ImplicitOps|ModRM },
+ [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
+ [0x71 ... 0x73] = { SrcImmByte|ModRM },
+ [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0x77] = { DstImplicit|SrcNone },
[0x78 ... 0x79] = { ModRM },
- [0x7c ... 0x7d] = { ModRM },
+ [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
[0x7e ... 0x7f] = { ImplicitOps|ModRM },
[0x80 ... 0x8f] = { DstImplicit|SrcImm },
[0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
@@ -244,18 +306,31 @@ static const struct {
[0xbf] = { DstReg|SrcMem16|ModRM|Mov },
[0xc0] = { ByteOp|DstMem|SrcReg|ModRM },
[0xc1] = { DstMem|SrcReg|ModRM },
- [0xc2] = { SrcImmByte|ModRM },
+ [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp },
[0xc3] = { DstMem|SrcReg|ModRM|Mov },
- [0xc4 ... 0xc6] = { SrcImmByte|ModRM },
+ [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
+ [0xc5] = { SrcImmByte|ModRM },
+ [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp },
[0xc7] = { ImplicitOps|ModRM },
[0xc8 ... 0xcf] = { ImplicitOps },
- [0xd0 ... 0xd5] = { ModRM },
+ [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xd6] = { ImplicitOps|ModRM },
- [0xd7 ... 0xdf] = { ModRM },
- [0xe0 ... 0xe6] = { ModRM },
+ [0xd7] = { ModRM },
+ [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xe6] = { ModRM },
[0xe7] = { ImplicitOps|ModRM },
- [0xe8 ... 0xef] = { ModRM },
- [0xf0 ... 0xff] = { ModRM }
+ [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xf7] = { ModRM },
+ [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xff] = { ModRM }
};

static const opcode_desc_t xop_table[] = {
@@ -1310,10 +1385,12 @@ static bool vcpu_has(
#define vcpu_has_lahf_lm() vcpu_has(0x80000001, ECX, 0, ctxt, ops)
#define vcpu_has_cr8_legacy() vcpu_has(0x80000001, ECX, 4, ctxt, ops)
#define vcpu_has_lzcnt() vcpu_has(0x80000001, ECX, 5, ctxt, ops)
+#define vcpu_has_sse4a() vcpu_has(0x80000001, ECX, 6, ctxt, ops)
#define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX, 7, ctxt, ops)
#define vcpu_has_tbm() vcpu_has(0x80000001, ECX, 21, ctxt, ops)
#define vcpu_has_bmi1() vcpu_has( 7, EBX, 3, ctxt, ops)
#define vcpu_has_hle() vcpu_has( 7, EBX, 4, ctxt, ops)
+#define vcpu_has_avx2() vcpu_has( 7, EBX, 5, ctxt, ops)
#define vcpu_has_bmi2() vcpu_has( 7, EBX, 8, ctxt, ops)
#define vcpu_has_rtm() vcpu_has( 7, EBX, 11, ctxt, ops)
#define vcpu_has_mpx() vcpu_has( 7, EBX, 14, ctxt, ops)
@@ -1914,6 +1991,7 @@ struct x86_emulate_state {
opcode_desc_t desc;
union vex vex;
union evex evex;
+ enum simd_opsize simd_size;

/*
* Data operand effective address (usually computed from ModRM).
@@ -2073,7 +2151,8 @@ x86_decode_twobyte(
case 0x50 ... 0x77:
case 0x79 ... 0x7f:
case 0xae:
- case 0xc2 ... 0xc6:
+ case 0xc2 ... 0xc3:
+ case 0xc5 ... 0xc6:
case 0xd0 ... 0xfe:
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
break;
@@ -2100,8 +2179,23 @@ x86_decode_twobyte(
case 0xbd: bsr / lzcnt
* They're being dealt with in the execution phase (if at all).
*/
+
+ case 0xc4: /* pinsrw */
+ ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0, 0xc4): /* vpinsrw */
+ state->desc = DstReg | SrcMem16 | ModRM;
+ break;
}

+ /*
+ * Scalar forms of most VEX-encoded TwoOp instructions have
+ * three operands.
+ */
+ if ( state->simd_size && vex.opcx &&
+ (vex.pfx & VEX_PREFIX_SCALAR_MASK) )
+ state->desc &= ~TwoOp;
+
done:
return rc;
}
@@ -2239,6 +2333,7 @@ x86_decode(
default:
opcode = b | MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
ext = ext_0f;
+ state->simd_size = twobyte_table[b].size;
break;
case 0x38:
b = insn_fetch_type(uint8_t);
@@ -2345,6 +2440,7 @@ x86_decode(
case vex_0f:
opcode |= MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
d = twobyte_table[b].desc;
+ state->simd_size = twobyte_table[b].size;
break;
case vex_0f38:
opcode |= MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
@@ -2602,13 +2698,53 @@ x86_decode(
ea.mem.off = truncate_ea(ea.mem.off);
}

- /*
- * When prefix 66 has a meaning different from operand-size override,
- * operand size defaults to 4 and can't be overridden to 2.
- */
- if ( op_bytes == 2 &&
- (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
- op_bytes = 4;
+ switch ( state->simd_size )
+ {
+ case simd_none:
+ /*
+ * When prefix 66 has a meaning different from operand-size override,
+ * operand size defaults to 4 and can't be overridden to 2.
+ */
+ if ( op_bytes == 2 &&
+ (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
+ op_bytes = 4;
+ break;
+
+ case simd_packed_int:
+ switch ( vex.pfx )
+ {
+ case vex_none: op_bytes = 8; break;
+ case vex_66: op_bytes = 16 << vex.l; break;
+ default: op_bytes = 0; break;
+ }
+ break;
+
+ case simd_single_fp:
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ {
+ op_bytes = 0;
+ break;
+ case simd_packed_fp:
+ if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
+ {
+ op_bytes = 0;
+ break;
+ }
+ }
+ /* fall through */
+ case simd_any_fp:
+ switch ( vex.pfx )
+ {
+ default: op_bytes = 16 << vex.l; break;
+ case vex_f3: op_bytes = 4; break;
+ case vex_f2: op_bytes = 8; break;
+ }
+ break;
+
+ default:
+ op_bytes = 0;
+ break;
+ }

done:
return rc;
@@ -2633,8 +2769,10 @@ x86_emulate(
uint8_t b, d;
bool singlestep = (_regs._eflags & X86_EFLAGS_TF) &&
!is_branch_step(ctxt, ops);
+ bool sfence = false;
struct operand src = { .reg = PTR_POISON };
struct operand dst = { .reg = PTR_POISON };
+ unsigned long cr4;
enum x86_swint_type swint_type;
struct fpu_insn_ctxt fic;
struct x86_emulate_stub stub = {};
@@ -2705,6 +2843,8 @@ x86_emulate(
ea.bytes = 2;
goto srcmem_common;
case SrcMem:
+ if ( state->simd_size )
+ break;
ea.bytes = (d & ByteOp) ? 1 : op_bytes;
srcmem_common:
src = ea;
@@ -2805,6 +2945,11 @@ x86_emulate(
d = (d & ~DstMask) | DstMem;
/* Becomes a normal DstMem operation from here on. */
case DstMem:
+ if ( state->simd_size )
+ {
+ generate_exception_if(lock_prefix, EXC_UD);
+ break;
+ }
ea.bytes = (d & ByteOp) ? 1 : op_bytes;
dst = ea;
if ( dst.type == OP_REG )
@@ -2839,7 +2984,6 @@ x86_emulate(
{
enum x86_segment seg;
struct segment_register cs, sreg;
- unsigned long cr4;
struct cpuid_leaf cpuid_leaf;
uint64_t msr_val;

@@ -5017,116 +5161,117 @@ x86_emulate(
case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
break;

- case X86EMUL_OPC(0x0f, 0x2b): /* movntps xmm,m128 */
- case X86EMUL_OPC_VEX(0x0f, 0x2b): /* vmovntps xmm,m128 */
- /* vmovntps ymm,m256 */
- case X86EMUL_OPC_66(0x0f, 0x2b): /* movntpd xmm,m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* vmovntpd xmm,m128 */
- /* vmovntpd ymm,m256 */
- fail_if(ea.type != OP_MEM);
- /* fall through */
- case X86EMUL_OPC(0x0f, 0x28): /* movaps xmm/m128,xmm */
- case X86EMUL_OPC_VEX(0x0f, 0x28): /* vmovaps xmm/m128,xmm */
- /* vmovaps ymm/m256,ymm */
- case X86EMUL_OPC_66(0x0f, 0x28): /* movapd xmm/m128,xmm */
- case X86EMUL_OPC_VEX_66(0x0f, 0x28): /* vmovapd xmm/m128,xmm */
- /* vmovapd ymm/m256,ymm */
- case X86EMUL_OPC(0x0f, 0x29): /* movaps xmm,xmm/m128 */
- case X86EMUL_OPC_VEX(0x0f, 0x29): /* vmovaps xmm,xmm/m128 */
- /* vmovaps ymm,ymm/m256 */
- case X86EMUL_OPC_66(0x0f, 0x29): /* movapd xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* vmovapd xmm,xmm/m128 */
- /* vmovapd ymm,ymm/m256 */
- case X86EMUL_OPC(0x0f, 0x10): /* movups xmm/m128,xmm */
- case X86EMUL_OPC_VEX(0x0f, 0x10): /* vmovups xmm/m128,xmm */
- /* vmovups ymm/m256,ymm */
- case X86EMUL_OPC_66(0x0f, 0x10): /* movupd xmm/m128,xmm */
- case X86EMUL_OPC_VEX_66(0x0f, 0x10): /* vmovupd xmm/m128,xmm */
- /* vmovupd ymm/m256,ymm */
- case X86EMUL_OPC_F3(0x0f, 0x10): /* movss xmm/m32,xmm */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x10): /* vmovss xmm/m32,xmm */
- case X86EMUL_OPC_F2(0x0f, 0x10): /* movsd xmm/m64,xmm */
- case X86EMUL_OPC_VEX_F2(0x0f, 0x10): /* vmovsd xmm/m64,xmm */
- case X86EMUL_OPC(0x0f, 0x11): /* movups xmm,xmm/m128 */
- case X86EMUL_OPC_VEX(0x0f, 0x11): /* vmovups xmm,xmm/m128 */
- /* vmovups ymm,ymm/m256 */
- case X86EMUL_OPC_66(0x0f, 0x11): /* movupd xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* vmovupd xmm,xmm/m128 */
- /* vmovupd ymm,ymm/m256 */
- case X86EMUL_OPC_F3(0x0f, 0x11): /* movss xmm,xmm/m32 */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* vmovss xmm,xmm/m32 */
- case X86EMUL_OPC_F2(0x0f, 0x11): /* movsd xmm,xmm/m64 */
- case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* vmovsd xmm,xmm/m64 */
- {
- uint8_t *buf = get_stub(stub);
+#define CASE_SIMD_PACKED_INT(pfx, opc) \
+ case X86EMUL_OPC(pfx, opc): \
+ case X86EMUL_OPC_66(pfx, opc)
+#define CASE_SIMD_SINGLE_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind(pfx, opc): \
+ case X86EMUL_OPC##kind##_F3(pfx, opc)
+#define CASE_SIMD_DOUBLE_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind##_66(pfx, opc): \
+ case X86EMUL_OPC##kind##_F2(pfx, opc)
+#define CASE_SIMD_ALL_FP(kind, pfx, opc) \
+ CASE_SIMD_SINGLE_FP(kind, pfx, opc): \
+ CASE_SIMD_DOUBLE_FP(kind, pfx, opc)
+#define CASE_SIMD_PACKED_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind(pfx, opc): \
+ case X86EMUL_OPC##kind##_66(pfx, opc)
+#define CASE_SIMD_SCALAR_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind##_F3(pfx, opc): \
+ case X86EMUL_OPC##kind##_F2(pfx, opc)

- fic.insn_bytes = 5;
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
- buf[5] = 0xc3;
+ CASE_SIMD_SCALAR_FP(, 0x0f, 0x2b): /* movnts{s,d} xmm,mem */
+ host_and_vcpu_must_have(sse4a);
+ /* fall through */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2b): /* movntp{s,d} xmm,m128 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2b): /* vmovntp{s,d} {x,y}mm,mem */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ sfence = true;
+ /* fall through */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x10): /* mov{up,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x10): /* vmovup{s,d} {x,y}mm/mem,{x,y}mm */
+ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x10): /* vmovs{s,d} mem,xmm */
+ /* vmovs{s,d} xmm,xmm,xmm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x11): /* mov{up,s}{s,d} xmm,xmm/mem */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x11): /* vmovup{s,d} {x,y}mm,{x,y}mm/mem */
+ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x11): /* vmovs{s,d} xmm,mem */
+ /* vmovs{s,d} xmm,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x14): /* unpcklp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x14): /* vunpcklp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x15): /* unpckhp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x15): /* vunpckhp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x28): /* movap{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x28): /* vmovap{s,d} {x,y}mm/mem,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x29): /* movap{s,d} xmm,xmm/m128 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x29): /* vmovap{s,d} {x,y}mm,{x,y}mm/mem */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x51): /* sqrt{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x51): /* vsqrtp{s,d} {x,y}mm/mem,{x,y}mm */
+ /* vsqrts{s,d} xmm/m32,xmm,xmm */
+ CASE_SIMD_SINGLE_FP(, 0x0f, 0x52): /* rsqrt{p,s}s xmm/mem,xmm */
+ CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x52): /* vrsqrtps {x,y}mm/mem,{x,y}mm */
+ /* vrsqrtss xmm/m32,xmm,xmm */
+ CASE_SIMD_SINGLE_FP(, 0x0f, 0x53): /* rcp{p,s}s xmm/mem,xmm */
+ CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x53): /* vrcpps {x,y}mm/mem,{x,y}mm */
+ /* vrcpss xmm/m32,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x54): /* andp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x54): /* vandp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x55): /* andnp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x55): /* vandnp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x56): /* orp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x56): /* vorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x57): /* xorp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x57): /* vxorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x58): /* add{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x58): /* vadd{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x59): /* mul{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x59): /* vmul{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5c): /* sub{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5c): /* vsub{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5d): /* min{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5d): /* vmin{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5e): /* div{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5f): /* max{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( vex.opcx == vex_none )
{
if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ {
+ simd_0f_sse2:
vcpu_must_have(sse2);
+ }
else
vcpu_must_have(sse);
- ea.bytes = 16;
- SET_SSE_PREFIX(buf[0], vex.pfx);
+ simd_0f_xmm:
get_fpu(X86EMUL_FPU_xmm, &fic);
}
else
{
- fail_if((vex.reg != 0xf) &&
- ((ea.type == OP_MEM) ||
- !(vex.pfx & VEX_PREFIX_SCALAR_MASK)));
+ /* vmovs{s,d} to/from memory have only two operands. */
+ if ( (b & ~1) == 0x10 && ea.type == OP_MEM )
+ d |= TwoOp;
+ simd_0f_avx:
host_and_vcpu_must_have(avx);
+ simd_0f_ymm:
get_fpu(X86EMUL_FPU_ymm, &fic);
- ea.bytes = 16 << vex.l;
}
- if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
- ea.bytes = vex.pfx & VEX_PREFIX_DOUBLE_MASK ? 8 : 4;
+ simd_0f_common:
+ {
+ uint8_t *buf = get_stub(stub);
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
if ( ea.type == OP_MEM )
{
- uint32_t mxcsr = 0;
-
- if ( b < 0x28 )
- mxcsr = MXCSR_MM;
- else if ( vcpu_has_misalignsse() )
- asm ( "stmxcsr %0" : "=m" (mxcsr) );
- generate_exception_if(!(mxcsr & MXCSR_MM) &&
- !is_aligned(ea.mem.seg, ea.mem.off, ea.bytes,
- ctxt, ops),
- EXC_GP, 0);
- if ( !(b & 1) )
- rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
- ea.bytes, ctxt);
- else
- fail_if(!ops->write); /* Check before running the stub. */
/* convert memory operand to (%rAX) */
rex_prefix &= ~REX_B;
vex.b = 1;
buf[4] &= 0x38;
}
- if ( !rc )
- {
- copy_REX_VEX(buf, rex_prefix, vex);
- asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
- : "memory" );
- }
- put_fpu(&fic);
- put_stub(stub);
- if ( !rc && (b & 1) && (ea.type == OP_MEM) )
- {
- ASSERT(ops->write); /* See the fail_if() above. */
- rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
- ea.bytes, ctxt);
- }
- if ( rc )
- goto done;
- dst.type = OP_NONE;
+ fic.insn_bytes = 5;
break;
}

@@ -5284,6 +5429,119 @@ x86_emulate(
singlestep = _regs._eflags & X86_EFLAGS_TF;
break;

+ CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x61): /* vpunpcklwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x62): /* punpckldq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x62): /* vpunpckldq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x68): /* punpckhbw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x68): /* vpunpckhbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x69): /* punpckhwd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x69): /* vpunpckhwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x6a): /* punpckhdq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6a): /* vpunpckhdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ op_bytes = vex.pfx ? 16 << vex.l : b & 8 ? 8 : 4;
+ /* fall through */
+ CASE_SIMD_PACKED_INT(0x0f, 0x63): /* packsswb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x63): /* vpacksswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x64): /* pcmpgtb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x64): /* vpcmpgtb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x65): /* pcmpgtw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x65): /* vpcmpgtw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x66): /* pcmpgtd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x66): /* vpcmpgtd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x67): /* packuswb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x67): /* vpackuswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x6b): /* packssdw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6b): /* vpackssdw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x6c): /* punpcklqdq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6c): /* vpunpcklqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x6d): /* punpckhqdq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6d): /* vpunpckhqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x74): /* pcmpeqb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x74): /* vpcmpeqb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x75): /* pcmpeqw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x75): /* vpcmpeqw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x76): /* pcmpeqd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x76): /* vpcmpeqd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xd4): /* paddq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd4): /* vpaddq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd5): /* pmullw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd5): /* vpmullw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd8): /* psubusb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd8): /* vpsubusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd9): /* psubusw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd9): /* vpsubusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xda): /* pminub xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xda): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdb): /* pand {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdb): /* vpand {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdc): /* paddusb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdc): /* vpaddusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdd): /* paddusw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdd): /* vpaddusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xde): /* pmaxub xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xde): /* vpmaxub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdf): /* pandn {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdf): /* vpandn {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xe0): /* pavgb xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe0): /* vpavgb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xe3): /* pavgw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe3): /* vpavgw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xe4): /* pmulhuw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe4): /* vpmulhuw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe5): /* pmulhw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe5): /* vpmulhw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe8): /* psubsb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe8): /* vpsubsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe9): /* psubsw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe9): /* vpsubsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xea): /* pminsw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xea): /* vpminsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xeb): /* por {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xeb): /* vpor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xec): /* paddsb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xec): /* vpaddsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xed): /* paddsw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xed): /* vpaddsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xee): /* pmaxsw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xee): /* vpmaxsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xef): /* pxor {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xef): /* vpxor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xf4): /* pmuludq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf4): /* vpmuludq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xf6): /* psadbw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf6): /* vpsadbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf8): /* psubb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf8): /* vpsubb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf9): /* psubw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf9): /* vpsubw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfa): /* psubd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfa): /* vpsubd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xfb): /* psubq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfb): /* vpsubq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfc): /* paddb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfc): /* vpaddb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfd): /* paddw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfd): /* vpaddw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfe): /* paddd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfe): /* vpaddd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ simd_0f_int:
+ if ( vex.opcx != vex_none )
+ {
+ if ( !vex.l )
+ goto simd_0f_avx;
+ host_and_vcpu_must_have(avx2);
+ goto simd_0f_ymm;
+ }
+ if ( vex.pfx )
+ goto simd_0f_sse2;
+ simd_0f_mmx:
+ host_and_vcpu_must_have(mmx);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ goto simd_0f_common;
+
case X86EMUL_OPC(0x0f, 0xe7): /* movntq mm,m64 */
case X86EMUL_OPC_66(0x0f, 0xe7): /* movntdq xmm,m128 */
case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
@@ -5413,6 +5671,81 @@ x86_emulate(
break;
}

+ CASE_SIMD_PACKED_INT(0x0f, 0x70): /* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x70): /* pshufhw $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x70): /* vpshufhw $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F2(0x0f, 0x70): /* pshuflw $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x70): /* vpshuflw $imm8,{x,y}mm/mem,{x,y}mm */
+ d = (d & ~SrcMask) | SrcMem | TwoOp;
+ op_bytes = vex.pfx ? 16 << vex.l : 8;
+ simd_0f_int_imm8:
+ if ( vex.opcx != vex_none )
+ {
+ if ( vex.l )
+ host_and_vcpu_must_have(avx2);
+ else
+ {
+ simd_0f_imm8_avx:
+ host_and_vcpu_must_have(avx);
+ }
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ simd_0f_imm8_sse2:
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ simd_0f_imm8:
+ {
+ uint8_t *buf = get_stub(stub);
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ buf[4] &= 0x38;
+ }
+ buf[5] = imm1;
+ fic.insn_bytes = 6;
+ break;
+ }
+
+ case X86EMUL_OPC_F2(0x0f, 0xf0): /* lddqu m128,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f, 0x7c): /* haddpd xmm/m128,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0x7c): /* haddps xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7c): /* vhaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x7c): /* vhaddps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x7d): /* hsubpd xmm/m128,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0x7d): /* hsubps xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7d): /* vhsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x7d): /* vhsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xd0): /* addsubpd xmm/m128,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0xd0): /* addsubps xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd0): /* vaddsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0xd0): /* vaddsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ op_bytes = 16 << vex.l;
+ if ( vex.opcx != vex_none )
+ goto simd_0f_avx;
+ host_and_vcpu_must_have(sse3);
+ goto simd_0f_xmm;
+
case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
if ( test_cc(b, _regs._eflags) )
jmp_rel((int32_t)src.val);
@@ -5714,12 +6047,35 @@ x86_emulate(
}
goto add;

+ CASE_SIMD_ALL_FP(, 0x0f, 0xc2): /* cmp{p,s}{s,d} $imm8,xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2): /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0xc6): /* shufp{s,d} $imm8,xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+ d = (d & ~SrcMask) | SrcMem;
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ goto simd_0f_imm8_sse2;
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ goto simd_0f_imm8;
+ }
+ goto simd_0f_imm8_avx;
+
case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
/* Ignore the non-temporal hint for now. */
vcpu_must_have(sse2);
dst.val = src.val;
+ sfence = true;
break;

+ CASE_SIMD_PACKED_INT(0x0f, 0xc4): /* pinsrw $imm8,r32/m16,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xc4): /* vpinsrw $imm8,r32/m16,xmm,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ memcpy(mmvalp, &src.val, 2);
+ ea.type = OP_MEM;
+ goto simd_0f_int_imm8;
+
case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
{
union {
@@ -5898,6 +6254,42 @@ x86_emulate(
}
break;

+ CASE_SIMD_PACKED_INT(0x0f, 0xd1): /* psrlw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd2): /* psrld {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd3): /* psrlq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe1): /* psraw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe2): /* psrad {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe2): /* vpsrad xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf1): /* psllw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf2): /* pslld {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf2): /* vpslld xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf3): /* psllq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,{x,y}mm,{x,y}mm */
+ op_bytes = vex.pfx ? 16 : 8;
+ goto simd_0f_int;
+
+ case X86EMUL_OPC(0x0f, 0xd4): /* paddq mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xf4): /* pmuludq mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xfb): /* psubq mm/m64,mm */
+ vcpu_must_have(sse2);
+ goto simd_0f_mmx;
+
+ case X86EMUL_OPC(0x0f, 0xda): /* pminub mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xde): /* pmaxub mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xea): /* pminsw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xee): /* pmaxsw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xe0): /* pavgb mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xe3): /* pavgw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xe4): /* pmulhuw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xf6): /* psadbw mm/m64,mm */
+ vcpu_must_have(sse);
+ goto simd_0f_mmx;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);
@@ -6159,6 +6551,76 @@ x86_emulate(
goto cannot_emulate;
}

+ if ( state->simd_size )
+ {
+#ifdef __XEN__
+ uint8_t *buf = stub.ptr;
+#else
+ uint8_t *buf = get_stub(stub);
+#endif
+
+ generate_exception_if(!op_bytes, EXC_UD);
+ generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
+ EXC_UD);
+
+ if ( !buf )
+ BUG();
+ if ( vex.opcx == vex_none )
+ SET_SSE_PREFIX(buf[0], vex.pfx);
+
+ buf[fic.insn_bytes] = 0xc3;
+ copy_REX_VEX(buf, rex_prefix, vex);
+
+ if ( ea.type == OP_MEM )
+ {
+ uint32_t mxcsr = 0;
+
+ if ( op_bytes < 16 ||
+ (vex.opcx
+ ? /* vmov{a,nt}p{s,d} are exceptions. */
+ ext != ext_0f || ((b | 1) != 0x29 && b != 0x2b)
+ : /* movup{s,d} and lddqu are exceptions. */
+ ext == ext_0f && ((b | 1) == 0x11 || b == 0xf0)) )
+ mxcsr = MXCSR_MM;
+ else if ( vcpu_has_misalignsse() )
+ asm ( "stmxcsr %0" : "=m" (mxcsr) );
+ generate_exception_if(!(mxcsr & MXCSR_MM) &&
+ !is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
+ ctxt, ops),
+ EXC_GP, 0);
+ if ( (d & SrcMask) == SrcMem )
+ {
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+ dst.type = OP_NONE;
+ }
+ else if ( (d & DstMask) == DstMem )
+ {
+ fail_if(!ops->write); /* Check before running the stub. */
+ ASSERT(d & Mov);
+ dst.type = OP_MEM;
+ dst.bytes = op_bytes;
+ dst.mem = ea.mem;
+ }
+ else if ( (d & SrcMask) == SrcMem16 )
+ dst.type = OP_NONE;
+ else
+ {
+ ASSERT_UNREACHABLE();
+ return X86EMUL_UNHANDLEABLE;
+ }
+ }
+ else
+ dst.type = OP_NONE;
+
+ invoke_stub("", "", "+m" (*mmvalp), "+m" (fic.exn_raised)
+ : "a" (mmvalp));
+
+ put_stub(stub);
+ put_fpu(&fic);
+ }
+
switch ( dst.type )
{
case OP_REG:
@@ -6185,8 +6647,11 @@ x86_emulate(
else
{
fail_if(!ops->write);
- rc = ops->write(
- dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+ rc = ops->write(dst.mem.seg, dst.mem.off,
+ !state->simd_size ? &dst.val : (void *)mmvalp,
+ dst.bytes, ctxt);
+ if ( sfence )
+ asm volatile ( "sfence" ::: "memory" );
}
if ( rc != 0 )
goto done;
@@ -6458,22 +6923,6 @@ x86_insn_is_mem_write(const struct x86_e
case 0x6c: case 0x6d: /* INS */
case 0xa4: case 0xa5: /* MOVS */
case 0xaa: case 0xab: /* STOS */
- case X86EMUL_OPC(0x0f, 0x11): /* MOVUPS */
- case X86EMUL_OPC_VEX(0x0f, 0x11): /* VMOVUPS */
- case X86EMUL_OPC_66(0x0f, 0x11): /* MOVUPD */
- case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* VMOVUPD */
- case X86EMUL_OPC_F3(0x0f, 0x11): /* MOVSS */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* VMOVSS */
- case X86EMUL_OPC_F2(0x0f, 0x11): /* MOVSD */
- case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* VMOVSD */
- case X86EMUL_OPC(0x0f, 0x29): /* MOVAPS */
- case X86EMUL_OPC_VEX(0x0f, 0x29): /* VMOVAPS */
- case X86EMUL_OPC_66(0x0f, 0x29): /* MOVAPD */
- case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* VMOVAPD */
- case X86EMUL_OPC(0x0f, 0x2b): /* MOVNTPS */
- case X86EMUL_OPC_VEX(0x0f, 0x2b): /* VMOVNTPS */
- case X86EMUL_OPC_66(0x0f, 0x2b): /* MOVNTPD */
- case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* VMOVNTPD */
case X86EMUL_OPC(0x0f, 0x7e): /* MOVD/MOVQ */
case X86EMUL_OPC_66(0x0f, 0x7e): /* MOVD/MOVQ */
case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* VMOVD/VMOVQ */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -70,12 +70,14 @@
#define cpu_has_xsavec boot_cpu_has(X86_FEATURE_XSAVEC)
#define cpu_has_xgetbv1 boot_cpu_has(X86_FEATURE_XGETBV1)
#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
+#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
#define cpu_has_monitor boot_cpu_has(X86_FEATURE_MONITOR)
#define cpu_has_eist boot_cpu_has(X86_FEATURE_EIST)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#define cpu_has_rdrand boot_cpu_has(X86_FEATURE_RDRAND)
#define cpu_has_rdseed boot_cpu_has(X86_FEATURE_RDSEED)
#define cpu_has_cmp_legacy boot_cpu_has(X86_FEATURE_CMP_LEGACY)
+#define cpu_has_sse4a boot_cpu_has(X86_FEATURE_SSE4A)
#define cpu_has_tbm boot_cpu_has(X86_FEATURE_TBM)
#define cpu_has_itsc boot_cpu_has(X86_FEATURE_ITSC)
#define cpu_has_hle boot_cpu_has(X86_FEATURE_HLE)
Andrew Cooper
2017-03-01 13:17:17 UTC
Post by Jan Beulich
e 0x0f-escape
space with memory operands. Not covered here are irregular moves,
converts, and {,U}COMIS{S,D} (modifying EFLAGS).
Your email has text corruption in this paragraph, but the patch itself
looks ok.
Post by Jan Beulich
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -2602,13 +2698,53 @@ x86_decode(
ea.mem.off = truncate_ea(ea.mem.off);
}
- /*
- * When prefix 66 has a meaning different from operand-size override,
- * operand size defaults to 4 and can't be overridden to 2.
- */
- if ( op_bytes == 2 &&
- (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
- op_bytes = 4;
Can we have a comment here along the lines of:

"Simple op_bytes calculations. More complicated cases use 0 and are
further decoded during execute." ?
Post by Jan Beulich
+ switch ( state->simd_size )
+ {
+ case simd_none:
+ /*
+ * When prefix 66 has a meaning different from operand-size override,
+ * operand size defaults to 4 and can't be overridden to 2.
+ */
+ if ( op_bytes == 2 &&
+ (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
+ op_bytes = 4;
+ break;
+
+ case simd_packed_int:
+ switch ( vex.pfx )
+ {
+ case vex_none: op_bytes = 8; break;
+ case vex_66: op_bytes = 16 << vex.l; break;
+ default: op_bytes = 0; break;
+ }
+ break;
+
+ case simd_single_fp:
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ {
+ op_bytes = 0;
+ break;
+ case simd_packed_fp:
+ if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
+ {
+ op_bytes = 0;
+ break;
+ }
+ }
+ /* fall through */
+ case simd_any_fp:
+ switch ( vex.pfx )
+ {
+ default: op_bytes = 16 << vex.l; break;
+ case vex_f3: op_bytes = 4; break;
+ case vex_f2: op_bytes = 8; break;
+ }
+ break;
+
+ default:
+ op_bytes = 0;
+ break;
+ }
done:
return rc;
@@ -5413,6 +5671,81 @@ x86_emulate(
break;
}
+ CASE_SIMD_PACKED_INT(0x0f, 0x70): /* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x70): /* pshufhw $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x70): /* vpshufhw $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F2(0x0f, 0x70): /* pshuflw $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x70): /* vpshuflw $imm8,{x,y}mm/mem,{x,y}mm */
+ d = (d & ~SrcMask) | SrcMem | TwoOp;
+ op_bytes = vex.pfx ? 16 << vex.l : 8;
+ simd_0f_int_imm8:
+ if ( vex.opcx != vex_none )
+ {
+ if ( vex.l )
+ host_and_vcpu_must_have(avx2);
+ else
+ {
+ simd_0f_imm8_avx:
+ host_and_vcpu_must_have(avx);
+ }
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ simd_0f_imm8_sse2:
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ simd_0f_imm8:
+ {
+ uint8_t *buf = get_stub(stub);
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ buf[4] &= 0x38;
+ }
+ buf[5] = imm1;
+ fic.insn_bytes = 6;
What is the expectation with setting up the ret in the stub or not?
This seems rather inconsistent at the moment.
Post by Jan Beulich
@@ -6159,6 +6551,76 @@ x86_emulate(
goto cannot_emulate;
}
+ if ( state->simd_size )
+ {
+#ifdef __XEN__
+ uint8_t *buf = stub.ptr;
+#else
+ uint8_t *buf = get_stub(stub);
+#endif
Is this stale? Everywhere else is just get_stub() without any ifdefary.

~Andrew
Jan Beulich
2017-03-01 13:50:18 UTC
Post by Andrew Cooper
Post by Jan Beulich
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -2602,13 +2698,53 @@ x86_decode(
ea.mem.off = truncate_ea(ea.mem.off);
}
- /*
- * When prefix 66 has a meaning different from operand-size override,
- * operand size defaults to 4 and can't be overridden to 2.
- */
- if ( op_bytes == 2 &&
- (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
- op_bytes = 4;
"Simple op_bytes calculations. More complicated cases use 0 and are
further decoded during execute." ?
Sure.
Post by Andrew Cooper
Post by Jan Beulich
@@ -5413,6 +5671,81 @@ x86_emulate(
break;
}
+ CASE_SIMD_PACKED_INT(0x0f, 0x70): /* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x70): /* pshufhw $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x70): /* vpshufhw $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F2(0x0f, 0x70): /* pshuflw $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x70): /* vpshuflw $imm8,{x,y}mm/mem,{x,y}mm */
+ d = (d & ~SrcMask) | SrcMem | TwoOp;
+ op_bytes = vex.pfx ? 16 << vex.l : 8;
+ simd_0f_int_imm8:
+ if ( vex.opcx != vex_none )
+ {
+ if ( vex.l )
+ host_and_vcpu_must_have(avx2);
+ else
+ {
+ simd_0f_imm8_avx:
+ host_and_vcpu_must_have(avx);
+ }
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ simd_0f_imm8_sse2:
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ simd_0f_imm8:
+ {
+ uint8_t *buf = get_stub(stub);
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ buf[4] &= 0x38;
+ }
+ buf[5] = imm1;
+ fic.insn_bytes = 6;
What is the expectation with setting up the ret in the stub or not?
The code portion actually invoking the stub should generally do this.
All it needs for this is fic.insn_bytes to be set correctly.
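I.e. the per-opcode code only fills in the instruction bytes and their
count, e.g. (roughly, taking the common 0f-space path added here)

    buf[0] = 0x3e;
    buf[1] = 0x3e;
    buf[2] = 0x0f;
    buf[3] = b;
    buf[4] = modrm;
    fic.insn_bytes = 5;

while the generic tail invoking the stub appends the ret based on that
count:

    buf[fic.insn_bytes] = 0xc3;
    copy_REX_VEX(buf, rex_prefix, vex);
    invoke_stub("", "", "+m" (*mmvalp), "+m" (fic.exn_raised)
                : "a" (mmvalp));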
Post by Andrew Cooper
This seems rather inconsistent at the moment.
Does it? At least in this patch I can't spot an inconsistency.
Post by Andrew Cooper
Post by Jan Beulich
@@ -6159,6 +6551,76 @@ x86_emulate(
goto cannot_emulate;
}
+ if ( state->simd_size )
+ {
+#ifdef __XEN__
+ uint8_t *buf = stub.ptr;
+#else
+ uint8_t *buf = get_stub(stub);
+#endif
Is this stale? Everywhere else is just get_stub() without any ifdefary.
No, it's not stale: In the hypervisor we can't use get_stub() a
second time, or else we'll invoke map_domain_page() a second
time, discarding (and hence leaking) the result of the earlier
one. And in the harness using get_stub() is the cleanest way
to get hold of the pointer again. I've considered and tried
several variants, but I couldn't come up with an approach not
needing any #ifdef - if you see a way, let me know.

Jan
Andrew Cooper
2017-03-01 18:08:06 UTC
Post by Jan Beulich
Post by Andrew Cooper
This seems rather inconsistent at the moment.
Does it? At least in this patch I can't spot an inconsistency.
I meant in general across the current use of stubs, but perhaps that is
just because of the transition to the new model.
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
@@ -6159,6 +6551,76 @@ x86_emulate(
goto cannot_emulate;
}
+ if ( state->simd_size )
+ {
+#ifdef __XEN__
+ uint8_t *buf = stub.ptr;
+#else
+ uint8_t *buf = get_stub(stub);
+#endif
Is this stale? Everywhere else is just get_stub() without any ifdefary.
No, it's not stale: In the hypervisor we can't use get_stub() a
second time, or else we'll invoke map_domain_page() a second
time, discarding (and hence leaking) the result of the earlier
one.
As an aside, shouldn't we refcount (and at least assert) that we don't
map the same page twice, to avoid leaks?
Post by Jan Beulich
And in the harness using get_stub() is the cleanest way
to get hold of the pointer again. I've considered and tried
several variants, but I couldn't come up with an approach not
needing any #ifdef - if you see a way, let me know.
As it drops out in the following patch, I am not overly fussed.

~Andrew
Jan Beulich
2017-02-28 12:51:00 UTC
Note that, unlike most scalar instructions, vcvt{,t}s{s,d}2si do #UD
when VEX.l is set on at least some Intel models. To be on the safe
side, implement the most restrictive mode here for now when emulating
an Intel CPU, and simply clear the bit when emulating an AMD one.

Signed-off-by: Jan Beulich <***@suse.com>
---
v3: Ignore VEX.l for scalar insns other than vcvt{,t}s{s,d}2si.
Introduce more labels to reduce redundant code. Add fic.exn_raised
constraint to relevant invoke_stub() uses.
v2: Don't pointlessly set TwoOp for cvtpi2p{s,d} and cvt{,t}p{s,d}2pi.
Set Mov for all converts (with follow-on adjustments to case
labels). Consistently generate #UD when VEX.l is disallowed. Don't
check VEX.vvvv for vcvtsi2s{s,d}.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -251,9 +251,10 @@ static const struct {
[0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
[0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
[0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp },
- [0x2a] = { ImplicitOps|ModRM },
+ [0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
- [0x2c ... 0x2f] = { ImplicitOps|ModRM },
+ [0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x2e ... 0x2f] = { ImplicitOps|ModRM },
[0x30 ... 0x35] = { ImplicitOps },
[0x37] = { ImplicitOps },
[0x38] = { DstReg|SrcMem|ModRM },
@@ -264,7 +265,7 @@ static const struct {
[0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
[0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
[0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
- [0x5a ... 0x5b] = { ModRM },
+ [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
[0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
[0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
@@ -327,7 +328,7 @@ static const struct {
[0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0xe6] = { ModRM },
+ [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
[0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
@@ -5372,6 +5373,101 @@ x86_emulate(
goto done;
break;

+ case X86EMUL_OPC_66(0x0f, 0x2a): /* cvtpi2pd mm/m64,xmm */
+ if ( ea.type == OP_REG )
+ {
+ case X86EMUL_OPC(0x0f, 0x2a): /* cvtpi2ps mm/m64,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2c): /* cvttp{s,d}2pi xmm/mem,mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2d): /* cvtp{s,d}2pi xmm/mem,mm */
+ host_and_vcpu_must_have(mmx);
+ }
+ op_bytes = (b & 4) && (vex.pfx & VEX_PREFIX_DOUBLE_MASK) ? 16 : 8;
+ goto simd_0f_fp;
+
+ CASE_SIMD_SCALAR_FP(, 0x0f, 0x2a): /* cvtsi2s{s,d} r/m,xmm */
+ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x2a): /* vcvtsi2s{s,d} r/m,xmm,xmm */
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ if ( ea.type == OP_MEM )
+ {
+ rc = read_ulong(ea.mem.seg, ea.mem.off, &src.val,
+ rex_prefix & REX_W ? 8 : 4, ctxt, ops);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+ }
+ else
+ src.val = rex_prefix & REX_W ? *ea.reg : (uint32_t)*ea.reg;
+
+ state->simd_size = simd_none;
+ goto simd_0f_rm;
+
+ CASE_SIMD_SCALAR_FP(, 0x0f, 0x2c): /* cvtts{s,d}2si xmm/mem,reg */
+ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x2c): /* vcvtts{s,d}2si xmm/mem,reg */
+ CASE_SIMD_SCALAR_FP(, 0x0f, 0x2d): /* cvts{s,d}2si xmm/mem,reg */
+ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ if ( ctxt->vendor == X86_VENDOR_AMD )
+ vex.l = 0;
+ generate_exception_if(vex.l, EXC_UD);
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX and memory operand to (%rCX). */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( ea.type == OP_MEM )
+ {
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1] = 0x01;
+
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp,
+ vex.pfx & VEX_PREFIX_DOUBLE_MASK ? 8 : 4, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+ }
+ else
+ opc[1] = modrm & 0xc7;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ ea.reg = decode_register(modrm_reg, &_regs, 0);
+ invoke_stub("", "", "=a" (*ea.reg), "+m" (fic.exn_raised)
+ : "c" (mmvalp), "m" (*mmvalp));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ state->simd_size = simd_none;
+ break;
+
case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
generate_exception_if(!mode_ring0(), EXC_GP, 0);
fail_if(ops->write_msr == NULL);
@@ -5560,6 +5656,24 @@ x86_emulate(
dst.bytes = 4;
break;

+ CASE_SIMD_ALL_FP(, 0x0f, 0x5a): /* cvt{p,s}{s,d}2{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} xmm/mem,xmm */
+ /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm */
+ op_bytes = 4 << (((vex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + vex.l) +
+ !!(vex.pfx & VEX_PREFIX_DOUBLE_MASK));
+ simd_0f_cvt:
+ if ( vex.opcx == vex_none )
+ goto simd_0f_sse2;
+ goto simd_0f_avx;
+
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x5b): /* cvt{ps,dq}2{dq,ps} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x5b): /* vcvt{ps,dq}2{dq,ps} {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x5b): /* cvttps2dq xmm/mem,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x5b): /* vcvttps2dq {x,y}mm/mem,{x,y}mm */
+ d |= TwoOp;
+ op_bytes = 16 << vex.l;
+ goto simd_0f_cvt;
+
CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
@@ -5694,6 +5808,7 @@ x86_emulate(
get_fpu(X86EMUL_FPU_mmx, &fic);
}

+ simd_0f_rm:
opc = init_prefixes(stub);
opc[0] = b;
/* Convert memory/GPR operand to (%rAX). */
@@ -5706,7 +5821,8 @@ x86_emulate(
opc[2] = 0xc3;

copy_REX_VEX(opc, rex_prefix, vex);
- invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
+ invoke_stub("", "", "+m" (src.val), "+m" (fic.exn_raised)
+ : "a" (&src.val));
dst.val = src.val;

put_stub(stub);
@@ -6371,6 +6487,16 @@ x86_emulate(
vcpu_must_have(sse);
goto simd_0f_mmx;

+ case X86EMUL_OPC_66(0x0f, 0xe6): /* cvttpd2dq xmm/mem,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe6): /* vcvttpd2dq {x,y}mm/mem,xmm */
+ case X86EMUL_OPC_F3(0x0f, 0xe6): /* cvtdq2pd xmm/mem,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0xe6): /* vcvtdq2pd xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_F2(0x0f, 0xe6): /* cvtpd2dq xmm/mem,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0xe6): /* vcvtpd2dq {x,y}mm/mem,xmm */
+ d |= TwoOp;
+ op_bytes = 8 << (!!(vex.pfx & VEX_PREFIX_DOUBLE_MASK) + vex.l);
+ goto simd_0f_cvt;
+
CASE_SIMD_PACKED_INT(0x0f, 0xf7): /* maskmov{q,dqu} {,x}mm,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0xf7): /* vmaskmovdqu xmm,xmm */
generate_exception_if(ea.type != OP_REG, EXC_UD);
Andrew Cooper
2017-03-01 14:09:03 UTC
Post by Jan Beulich
Note that, unlike most scalar instructions, vcvt{,t}s{s,d}2si do #UD
when VEX.l is set on at least some Intel models. To be on the safe
side, implement the most restrictive mode here for now when emulating
an Intel CPU, and simply clear the bit when emulating an AMD one.
@@ -5560,6 +5656,24 @@ x86_emulate(
dst.bytes = 4;
break;
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5a): /* cvt{p,s}{s,d}2{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a): /* vcvtp{s,d}2p{s,d} xmm/mem,xmm */
+ /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm */
+ op_bytes = 4 << (((vex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + vex.l) +
+ !!(vex.pfx & VEX_PREFIX_DOUBLE_MASK));
:( My head hurts.
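For what it's worth, a worked expansion of that op_bytes expression, purely
as a reading aid (assuming the usual vex_66/vex_f3/vex_f2 = 1/2/3 encoding,
i.e. VEX_PREFIX_SCALAR_MASK matching F3/F2 and VEX_PREFIX_DOUBLE_MASK
matching 66/F2):

    /*
     * op_bytes = 4 << (((pfx & SCALAR) ? 0 : 1 + vex.l) + !!(pfx & DOUBLE))
     *
     *   none (cvtps2pd, packed single source): 4 << (1 + vex.l + 0) =  8 << vex.l
     *   66   (cvtpd2ps, packed double source): 4 << (1 + vex.l + 1) = 16 << vex.l
     *   F3   (cvtss2sd, scalar single source): 4 << (0         + 0) =  4
     *   F2   (cvtsd2ss, scalar double source): 4 << (0         + 1) =  8
     */

i.e. it yields the number of source bytes the insn actually reads.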
Post by Jan Beulich
+ if ( vex.opcx == vex_none )
+ goto simd_0f_sse2;
+ goto simd_0f_avx;
+
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x5b): /* cvt{ps,dq}2{dq,ps} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x5b): /* vcvt{ps,dq}2{dq,ps} {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x5b): /* cvttps2dq xmm/mem,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x5b): /* vcvttps2dq {x,y}mm/mem,{x,y}mm */
+ d |= TwoOp;
+ op_bytes = 16 << vex.l;
+ goto simd_0f_cvt;
+
CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
Jan Beulich
2017-02-28 12:50:25 UTC
Previously supported insns are being converted to the new model, and
several new ones are being added.

To keep the stub handling reasonably simple, integrate SET_SSE_PREFIX()
into copy_REX_VEX(), at once switching the stubs to use an empty REX
prefix instead of a double DS: one (no byte registers are being
accessed, so an empty REX prefix has no effect), except (of course) for
the 32-bit test harness build.

Signed-off-by: Jan Beulich <***@suse.com>
---
v4: Re-base.
v3: Re-base. Introduce more labels to reduce redundant code.
v2: Don't clear TwoOp for vmov{l,h}p{s,d} to memory. Move re-setting of
TwoOp into VEX-specific code paths where possible. Special case
{,v}maskmov{q,dqu} in stub invocation. Move {,v}movq code block to
proper position. Add zero-mask {,v}maskmov{q,dqu} tests.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1566,6 +1566,29 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing movq 32(%ecx),%xmm1...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(movq_from_mem2);
+
+ asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n"
+ put_insn(movq_from_mem2, "movq 32(%0), %%xmm1")
+ :: "c" (NULL) );
+
+ set_insn(movq_from_mem2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movq_from_mem2) )
+ goto fail;
+ asm ( "pcmpgtb %%xmm0, %%xmm0\n\t"
+ "pcmpeqb %%xmm1, %%xmm0\n\t"
+ "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing vmovq %xmm1,32(%edx)...");
if ( stack_exec && cpu_has_avx )
{
@@ -1590,6 +1613,29 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing vmovq 32(%edx),%xmm0...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmovq_from_mem);
+
+ asm volatile ( "pcmpeqb %%xmm0, %%xmm0\n"
+ put_insn(vmovq_from_mem, "vmovq 32(%0), %%xmm0")
+ :: "d" (NULL) );
+
+ set_insn(vmovq_from_mem);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmovq_from_mem) )
+ goto fail;
+ asm ( "pcmpgtb %%xmm1, %%xmm1\n\t"
+ "pcmpeqb %%xmm0, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
if ( stack_exec && cpu_has_sse2 )
{
@@ -1821,6 +1867,33 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing movd 32(%ecx),%mm4...");
+ if ( stack_exec && cpu_has_mmx )
+ {
+ decl_insn(movd_from_mem);
+
+ asm volatile ( "pcmpgtb %%mm4, %%mm4\n"
+ put_insn(movd_from_mem, "movd 32(%0), %%mm4")
+ :: "c" (NULL) );
+
+ set_insn(movd_from_mem);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movd_from_mem) )
+ goto fail;
+ asm ( "pxor %%mm2,%%mm2\n\t"
+ "pcmpeqb %%mm4, %%mm2\n\t"
+ "pmovmskb %%mm2, %0" : "=r" (rc) );
+ if ( rc != 0xf0 )
+ goto fail;
+ asm ( "pcmpeqb %%mm4, %%mm3\n\t"
+ "pmovmskb %%mm3, %0" : "=r" (rc) );
+ if ( rc != 0x0f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movd %xmm2,32(%edx)...");
if ( stack_exec && cpu_has_sse2 )
{
@@ -1845,6 +1918,34 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing movd 32(%edx),%xmm3...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(movd_from_mem2);
+
+ asm volatile ( "pcmpeqb %%xmm3, %%xmm3\n"
+ put_insn(movd_from_mem2, "movd 32(%0), %%xmm3")
+ :: "d" (NULL) );
+
+ set_insn(movd_from_mem2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movd_from_mem2) )
+ goto fail;
+ asm ( "pxor %%xmm1,%%xmm1\n\t"
+ "pcmpeqb %%xmm3, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm3, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing vmovd %xmm1,32(%ecx)...");
if ( stack_exec && cpu_has_avx )
{
@@ -1869,6 +1970,34 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing vmovd 32(%ecx),%xmm2...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmovd_from_mem);
+
+ asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+ put_insn(vmovd_from_mem, "vmovd 32(%0), %%xmm2")
+ :: "c" (NULL) );
+
+ set_insn(vmovd_from_mem);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmovd_from_mem) )
+ goto fail;
+ asm ( "pxor %%xmm0,%%xmm0\n\t"
+ "pcmpeqb %%xmm2, %%xmm0\n\t"
+ "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm1, %%xmm1\n\t"
+ "pcmpeqb %%xmm2, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movd %mm3,%ebx...");
if ( stack_exec && cpu_has_mmx )
{
@@ -1899,6 +2028,34 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing movd %ebx,%mm4...");
+ if ( stack_exec && cpu_has_mmx )
+ {
+ decl_insn(movd_from_reg);
+
+ /* See comment next to movd above. */
+ asm volatile ( "pcmpgtb %%mm4, %%mm4\n"
+ put_insn(movd_from_reg, "movd %%ebx, %%mm4")
+ :: );
+
+ set_insn(movd_from_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || !check_eip(movd_from_reg) )
+ goto fail;
+ asm ( "pxor %%mm2,%%mm2\n\t"
+ "pcmpeqb %%mm4, %%mm2\n\t"
+ "pmovmskb %%mm2, %0" : "=r" (rc) );
+ if ( rc != 0xf0 )
+ goto fail;
+ asm ( "pcmpeqb %%mm4, %%mm3\n\t"
+ "pmovmskb %%mm3, %0" : "=r" (rc) );
+ if ( rc != 0x0f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movd %xmm2,%ebx...");
if ( stack_exec && cpu_has_sse2 )
{
@@ -1924,6 +2081,35 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing movd %ebx,%xmm3...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(movd_from_reg2);
+
+ /* See comment next to movd above. */
+ asm volatile ( "pcmpgtb %%xmm3, %%xmm3\n"
+ put_insn(movd_from_reg2, "movd %%ebx, %%xmm3")
+ :: );
+
+ set_insn(movd_from_reg2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || !check_eip(movd_from_reg2) )
+ goto fail;
+ asm ( "pxor %%xmm1,%%xmm1\n\t"
+ "pcmpeqb %%xmm3, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm3, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing vmovd %xmm1,%ebx...");
if ( stack_exec && cpu_has_avx )
{
@@ -1949,6 +2135,35 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing vmovd %ebx,%xmm2...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmovd_from_reg);
+
+ /* See comment next to movd above. */
+ asm volatile ( "pcmpgtb %%xmm2, %%xmm2\n"
+ put_insn(vmovd_from_reg, "vmovd %%ebx, %%xmm2")
+ :: );
+
+ set_insn(vmovd_from_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || !check_eip(vmovd_from_reg) )
+ goto fail;
+ asm ( "pxor %%xmm0,%%xmm0\n\t"
+ "pcmpeqb %%xmm2, %%xmm0\n\t"
+ "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm1, %%xmm1\n\t"
+ "pcmpeqb %%xmm2, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
#ifdef __x86_64__
printf("%-40s", "Testing movq %mm3,32(%ecx)...");
if ( stack_exec && cpu_has_mmx )
@@ -2087,6 +2302,41 @@ int main(int argc, char **argv)
printf("skipped\n");
#endif

+ printf("%-40s", "Testing maskmovq (zero mask)...");
+ if ( stack_exec && cpu_has_sse )
+ {
+ decl_insn(maskmovq);
+
+ asm volatile ( "pcmpgtb %mm4, %mm4\n"
+ put_insn(maskmovq, "maskmovq %mm4, %mm4") );
+
+ set_insn(maskmovq);
+ regs.edi = 0;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(maskmovq) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing maskmovdqu (zero mask)...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(maskmovdqu);
+
+ asm volatile ( "pcmpgtb %xmm3, %xmm3\n"
+ put_insn(maskmovdqu, "maskmovdqu %xmm3, %xmm3") );
+
+ set_insn(maskmovdqu);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(maskmovdqu) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing lddqu 4(%edx),%xmm4...");
if ( stack_exec && cpu_has_sse3 )
{
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -241,9 +241,12 @@ static const struct {
[0x0f] = { ModRM|SrcImmByte },
[0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
[0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
- [0x12 ... 0x13] = { ImplicitOps|ModRM },
+ [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
[0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
- [0x16 ... 0x1f] = { ImplicitOps|ModRM },
+ [0x16] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x17] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+ [0x18 ... 0x1f] = { ImplicitOps|ModRM },
[0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
[0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
[0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
@@ -256,7 +259,7 @@ static const struct {
[0x38] = { DstReg|SrcMem|ModRM },
[0x3a] = { DstReg|SrcImmByte|ModRM },
[0x40 ... 0x4f] = { DstReg|SrcMem|ModRM|Mov },
- [0x50] = { ModRM },
+ [0x50] = { DstReg|SrcImplicit|ModRM|Mov },
[0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp },
[0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
[0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
@@ -267,14 +270,16 @@ static const struct {
[0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
[0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0x6e ... 0x6f] = { ImplicitOps|ModRM },
+ [0x6e] = { DstImplicit|SrcMem|ModRM|Mov },
+ [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int },
[0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
[0x71 ... 0x73] = { SrcImmByte|ModRM },
[0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0x77] = { DstImplicit|SrcNone },
[0x78 ... 0x79] = { ModRM },
[0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
- [0x7e ... 0x7f] = { ImplicitOps|ModRM },
+ [0x7e] = { DstMem|SrcImplicit|ModRM|Mov },
+ [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
[0x80 ... 0x8f] = { DstImplicit|SrcImm },
[0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
[0xa0 ... 0xa1] = { ImplicitOps|Mov },
@@ -316,19 +321,19 @@ static const struct {
[0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0xd6] = { ImplicitOps|ModRM },
- [0xd7] = { ModRM },
+ [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+ [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
[0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe6] = { ModRM },
- [0xe7] = { ImplicitOps|ModRM },
+ [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
[0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0xf7] = { ModRM },
+ [0xf7] = { DstMem|SrcMem|ModRM|Mov, simd_packed_int },
[0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xff] = { ModRM }
};
@@ -364,11 +369,6 @@ enum vex_pfx {

static const uint8_t sse_prefix[] = { 0x66, 0xf3, 0xf2 };

-#define SET_SSE_PREFIX(dst, vex_pfx) do { \
- if ( vex_pfx ) \
- (dst) = sse_prefix[(vex_pfx) - 1]; \
-} while (0)
-
union vex {
uint8_t raw[2];
struct {
@@ -383,15 +383,35 @@ union vex {
};
};

+#ifdef __x86_64__
+# define PFX2 REX_PREFIX
+#else
+# define PFX2 0x3e
+#endif
+#define PFX_BYTES 3
+#define init_prefixes(stub) ({ \
+ uint8_t *buf_ = get_stub(stub); \
+ buf_[0] = 0x3e; \
+ buf_[1] = PFX2; \
+ buf_[2] = 0x0f; \
+ buf_ + 3; \
+})
+
#define copy_REX_VEX(ptr, rex, vex) do { \
if ( (vex).opcx != vex_none ) \
{ \
if ( !mode_64bit() ) \
vex.reg |= 8; \
- ptr[0] = 0xc4, ptr[1] = (vex).raw[0], ptr[2] = (vex).raw[1]; \
+ (ptr)[0 - PFX_BYTES] = 0xc4; \
+ (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+ (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+ } \
+ else \
+ { \
+ if ( (vex).pfx ) \
+ (ptr)[0 - PFX_BYTES] = sse_prefix[(vex).pfx - 1]; \
+ (ptr)[1 - PFX_BYTES] |= rex; \
} \
- else if ( mode_64bit() ) \
- ptr[1] = rex | REX_PREFIX; \
} while (0)

union evex {
@@ -2149,7 +2169,8 @@ x86_decode_twobyte(
case 0x10 ... 0x18:
case 0x28 ... 0x2f:
case 0x50 ... 0x77:
- case 0x79 ... 0x7f:
+ case 0x79 ... 0x7d:
+ case 0x7f:
case 0xae:
case 0xc2 ... 0xc3:
case 0xc5 ... 0xc6:
@@ -2169,6 +2190,18 @@ x86_decode_twobyte(
op_bytes = mode_64bit() ? 8 : 4;
break;

+ case 0x7e:
+ ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
+ {
+ case X86EMUL_OPC_VEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
+ state->desc = DstImplicit | SrcMem | ModRM | Mov;
+ state->simd_size = simd_other;
+ /* Avoid the state->desc adjustment below. */
+ return X86EMUL_OKAY;
+ }
+ break;
+
case 0xb8: /* jmpe / popcnt */
if ( rep_prefix() )
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
@@ -2766,7 +2799,7 @@ x86_emulate(
struct cpu_user_regs _regs = *ctxt->regs;
struct x86_emulate_state state;
int rc;
- uint8_t b, d;
+ uint8_t b, d, *opc = NULL;
bool singlestep = (_regs._eflags & X86_EFLAGS_TF) &&
!is_branch_step(ctxt, ops);
bool sfence = false;
@@ -5233,6 +5266,7 @@ x86_emulate(
CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
CASE_SIMD_ALL_FP(, 0x0f, 0x5f): /* max{p,s}{s,d} xmm/mem,xmm */
CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ simd_0f_fp:
if ( vex.opcx == vex_none )
{
if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
@@ -5256,24 +5290,57 @@ x86_emulate(
get_fpu(X86EMUL_FPU_ymm, &fic);
}
simd_0f_common:
- {
- uint8_t *buf = get_stub(stub);
-
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
if ( ea.type == OP_MEM )
{
/* convert memory operand to (%rAX) */
rex_prefix &= ~REX_B;
vex.b = 1;
- buf[4] &= 0x38;
+ opc[1] &= 0x38;
}
- fic.insn_bytes = 5;
+ fic.insn_bytes = PFX_BYTES + 2;
break;
- }
+
+ case X86EMUL_OPC_66(0x0f, 0x12): /* movlpd m64,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x12): /* vmovlpd m64,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x13): /* movlp{s,d} xmm,m64 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x13): /* vmovlp{s,d} xmm,m64 */
+ case X86EMUL_OPC_66(0x0f, 0x16): /* movhpd m64,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x16): /* vmovhpd m64,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x17): /* movhp{s,d} xmm,m64 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x17): /* vmovhp{s,d} xmm,m64 */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC(0x0f, 0x12): /* movlps m64,xmm */
+ /* movhlps xmm,xmm */
+ case X86EMUL_OPC_VEX(0x0f, 0x12): /* vmovlps m64,xmm,xmm */
+ /* vmovhlps xmm,xmm,xmm */
+ case X86EMUL_OPC(0x0f, 0x16): /* movhps m64,xmm */
+ /* movlhps xmm,xmm */
+ case X86EMUL_OPC_VEX(0x0f, 0x16): /* vmovhps m64,xmm,xmm */
+ /* vmovlhps xmm,xmm,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ if ( (d & DstMask) != DstMem )
+ d &= ~TwoOp;
+ op_bytes = 8;
+ goto simd_0f_fp;
+
+ case X86EMUL_OPC_F3(0x0f, 0x12): /* movsldup xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x12): /* vmovsldup {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F2(0x0f, 0x12): /* movddup xmm/m64,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x12): /* vmovddup {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x16): /* movshdup xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x16): /* vmovshdup {x,y}mm/mem,{x,y}mm */
+ d |= TwoOp;
+ op_bytes = !(vex.pfx & VEX_PREFIX_DOUBLE_MASK) || vex.l
+ ? 16 << vex.l : 8;
+ simd_0f_sse3_avx:
+ if ( vex.opcx != vex_none )
+ goto simd_0f_avx;
+ host_and_vcpu_must_have(sse3);
+ goto simd_0f_xmm;

case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
@@ -5429,6 +5496,57 @@ x86_emulate(
singlestep = _regs._eflags & X86_EFLAGS_TF;
break;

+ CASE_SIMD_PACKED_FP(, 0x0f, 0x50): /* movmskp{s,d} xmm,reg */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ {
+ if ( b != 0x50 )
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ }
+ if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ else
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
+ if ( b == 0x50 || !vex.l )
+ host_and_vcpu_must_have(avx);
+ else
+ host_and_vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ dst.bytes = 4;
+ break;
+
CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
@@ -5542,134 +5660,74 @@ x86_emulate(
get_fpu(X86EMUL_FPU_mmx, &fic);
goto simd_0f_common;

- case X86EMUL_OPC(0x0f, 0xe7): /* movntq mm,m64 */
- case X86EMUL_OPC_66(0x0f, 0xe7): /* movntdq xmm,m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
- /* vmovntdq ymm,m256 */
- fail_if(ea.type != OP_MEM);
- /* fall through */
- case X86EMUL_OPC(0x0f, 0x6f): /* movq mm/m64,mm */
- case X86EMUL_OPC_66(0x0f, 0x6f): /* movdqa xmm/m128,xmm */
- case X86EMUL_OPC_F3(0x0f, 0x6f): /* movdqu xmm/m128,xmm */
- case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa xmm/m128,xmm */
- /* vmovdqa ymm/m256,ymm */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu xmm/m128,xmm */
- /* vmovdqu ymm/m256,ymm */
- case X86EMUL_OPC(0x0f, 0x7e): /* movd mm,r/m32 */
- /* movq mm,r/m64 */
- case X86EMUL_OPC_66(0x0f, 0x7e): /* movd xmm,r/m32 */
- /* movq xmm,r/m64 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* vmovd xmm,r/m32 */
- /* vmovq xmm,r/m64 */
- case X86EMUL_OPC(0x0f, 0x7f): /* movq mm,mm/m64 */
- case X86EMUL_OPC_66(0x0f, 0x7f): /* movdqa xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa xmm,xmm/m128 */
- /* vmovdqa ymm,ymm/m256 */
- case X86EMUL_OPC_F3(0x0f, 0x7f): /* movdqu xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu xmm,xmm/m128 */
- /* vmovdqu ymm,ymm/m256 */
- case X86EMUL_OPC_66(0x0f, 0xd6): /* movq xmm,xmm/m64 */
- case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
- {
- uint8_t *buf = get_stub(stub);
-
- fic.insn_bytes = 5;
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
- buf[5] = 0xc3;
- if ( vex.opcx == vex_none )
- {
- switch ( vex.pfx )
- {
- case vex_66:
- case vex_f3:
- vcpu_must_have(sse2);
- /* Converting movdqu to movdqa here: Our buffer is aligned. */
- buf[0] = 0x66;
- get_fpu(X86EMUL_FPU_xmm, &fic);
- ea.bytes = 16;
- break;
- case vex_none:
- if ( b != 0xe7 )
- host_and_vcpu_must_have(mmx);
- else
- vcpu_must_have(sse);
- get_fpu(X86EMUL_FPU_mmx, &fic);
- ea.bytes = 8;
- break;
- default:
- goto cannot_emulate;
- }
- }
- else
+ CASE_SIMD_PACKED_INT(0x0f, 0x6e): /* mov{d,q} r/m,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x7e): /* mov{d,q} {,x}mm,r/m */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */
+ if ( vex.opcx != vex_none )
{
- fail_if(vex.reg != 0xf);
+ generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
host_and_vcpu_must_have(avx);
get_fpu(X86EMUL_FPU_ymm, &fic);
- ea.bytes = 16 << vex.l;
}
- switch ( b )
- {
- case 0x7e:
- generate_exception_if(vex.l, EXC_UD);
- ea.bytes = op_bytes;
- break;
- case 0xd6:
- generate_exception_if(vex.l, EXC_UD);
- ea.bytes = 8;
- break;
- }
- if ( ea.type == OP_MEM )
+ else if ( vex.pfx )
{
- uint32_t mxcsr = 0;
-
- if ( ea.bytes < 16 || vex.pfx == vex_f3 )
- mxcsr = MXCSR_MM;
- else if ( vcpu_has_misalignsse() )
- asm ( "stmxcsr %0" : "=m" (mxcsr) );
- generate_exception_if(!(mxcsr & MXCSR_MM) &&
- !is_aligned(ea.mem.seg, ea.mem.off, ea.bytes,
- ctxt, ops),
- EXC_GP, 0);
- if ( b == 0x6f )
- rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
- ea.bytes, ctxt);
- else
- fail_if(!ops->write); /* Check before running the stub. */
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
}
- if ( ea.type == OP_MEM || b == 0x7e )
+ else
{
- /* Convert memory operand or GPR destination to (%rAX) */
- rex_prefix &= ~REX_B;
- vex.b = 1;
- buf[4] &= 0x38;
- if ( ea.type == OP_MEM )
- ea.reg = (void *)mmvalp;
- else /* Ensure zero-extension of a 32-bit result. */
- *ea.reg = 0;
- }
- if ( !rc )
- {
- copy_REX_VEX(buf, rex_prefix, vex);
- asm volatile ( "call *%0" : : "r" (stub.func), "a" (ea.reg)
- : "memory" );
+ host_and_vcpu_must_have(mmx);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
}
- put_fpu(&fic);
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert memory/GPR operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0x38;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
+ dst.val = src.val;
+
put_stub(stub);
- if ( !rc && (b != 0x6f) && (ea.type == OP_MEM) )
- {
- ASSERT(ops->write); /* See the fail_if() above. */
- rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
- ea.bytes, ctxt);
- }
- if ( rc )
- goto done;
- dst.type = OP_NONE;
+ put_fpu(&fic);
break;
- }
+
+ case X86EMUL_OPC_66(0x0f, 0xe7): /* movntdq xmm,m128 */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq {x,y}mm,mem */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ sfence = true;
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f, 0x6f): /* movdqa xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x6f): /* movdqu xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x7f): /* movdqa xmm,xmm/m128 */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa {x,y}mm,{x,y}mm/m128 */
+ case X86EMUL_OPC_F3(0x0f, 0x7f): /* movdqu xmm,xmm/m128 */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu {x,y}mm,{x,y}mm/mem */
+ d |= TwoOp;
+ op_bytes = 16 << vex.l;
+ if ( vex.opcx != vex_none )
+ goto simd_0f_avx;
+ goto simd_0f_sse2;
+
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
+ generate_exception_if(vex.l, EXC_UD);
+ d |= TwoOp;
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f, 0xd6): /* movq xmm,xmm/m64 */
+ case X86EMUL_OPC(0x0f, 0x6f): /* movq mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0x7f): /* movq mm,mm/m64 */
+ op_bytes = 8;
+ goto simd_0f_int;

CASE_SIMD_PACKED_INT(0x0f, 0x70): /* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
@@ -5704,25 +5762,25 @@ x86_emulate(
get_fpu(X86EMUL_FPU_mmx, &fic);
}
simd_0f_imm8:
- {
- uint8_t *buf = get_stub(stub);
-
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
if ( ea.type == OP_MEM )
{
/* Convert memory operand to (%rAX). */
rex_prefix &= ~REX_B;
vex.b = 1;
- buf[4] &= 0x38;
+ opc[1] &= 0x38;
}
- buf[5] = imm1;
- fic.insn_bytes = 6;
+ opc[2] = imm1;
+ fic.insn_bytes = PFX_BYTES + 3;
break;
- }
+
+ case X86EMUL_OPC_F3(0x0f, 0x7e): /* movq xmm/m64,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ op_bytes = 8;
+ goto simd_0f_int;

case X86EMUL_OPC_F2(0x0f, 0xf0): /* lddqu m128,xmm */
case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
@@ -5741,10 +5799,7 @@ x86_emulate(
case X86EMUL_OPC_VEX_66(0x0f, 0xd0): /* vaddsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_F2(0x0f, 0xd0): /* vaddsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
op_bytes = 16 << vex.l;
- if ( vex.opcx != vex_none )
- goto simd_0f_avx;
- host_and_vcpu_must_have(sse3);
- goto simd_0f_xmm;
+ goto simd_0f_sse3_avx;

case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
if ( test_cc(b, _regs._eflags) )
@@ -6279,6 +6334,17 @@ x86_emulate(
vcpu_must_have(sse2);
goto simd_0f_mmx;

+ case X86EMUL_OPC_F3(0x0f, 0xd6): /* movq2dq mm,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0xd6): /* movdq2q xmm,mm */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+ op_bytes = 8;
+ host_and_vcpu_must_have(mmx);
+ goto simd_0f_int;
+
+ case X86EMUL_OPC(0x0f, 0xe7): /* movntq mm,m64 */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ sfence = true;
+ /* fall through */
case X86EMUL_OPC(0x0f, 0xda): /* pminub mm/m64,mm */
case X86EMUL_OPC(0x0f, 0xde): /* pmaxub mm/m64,mm */
case X86EMUL_OPC(0x0f, 0xea): /* pminsw mm/m64,mm */
@@ -6290,6 +6356,73 @@ x86_emulate(
vcpu_must_have(sse);
goto simd_0f_mmx;

+ CASE_SIMD_PACKED_INT(0x0f, 0xf7): /* maskmov{q,dqu} {,x}mm,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf7): /* vmaskmovdqu xmm,xmm */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+ if ( vex.opcx != vex_none )
+ {
+ generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+ d |= TwoOp;
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+
+ /*
+ * While we can't reasonably provide fully correct behavior here
+ * (in particular avoiding the memory read in anticipation of all
+ * bytes in the range eventually being written), we can (and should)
+ * still suppress the memory access if all mask bits are clear. Read
+ * the mask bits via {,v}pmovmskb for that purpose.
+ */
+ opc = init_prefixes(stub);
+ opc[0] = 0xd7; /* {,v}pmovmskb */
+ /* (Ab)use "sfence" for latching the original REX.R / VEX.R. */
+ sfence = rex_prefix & REX_R;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+
+ put_stub(stub);
+ if ( !ea.val )
+ {
+ put_fpu(&fic);
+ goto complete_insn;
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
+ /* Restore high bit of XMM destination. */
+ if ( sfence )
+ {
+ rex_prefix |= REX_R;
+ vex.r = 0;
+ }
+
+ ea.type = OP_MEM;
+ ea.mem.off = truncate_ea(_regs.r(di));
+ sfence = true;
+ break;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);
@@ -6553,23 +6686,14 @@ x86_emulate(

if ( state->simd_size )
{
-#ifdef __XEN__
- uint8_t *buf = stub.ptr;
-#else
- uint8_t *buf = get_stub(stub);
-#endif
-
generate_exception_if(!op_bytes, EXC_UD);
generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
EXC_UD);

- if ( !buf )
+ if ( !opc )
BUG();
- if ( vex.opcx == vex_none )
- SET_SSE_PREFIX(buf[0], vex.pfx);
-
- buf[fic.insn_bytes] = 0xc3;
- copy_REX_VEX(buf, rex_prefix, vex);
+ opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+ copy_REX_VEX(opc, rex_prefix, vex);

if ( ea.type == OP_MEM )
{
@@ -6577,10 +6701,16 @@ x86_emulate(

if ( op_bytes < 16 ||
(vex.opcx
- ? /* vmov{a,nt}p{s,d} are exceptions. */
- ext != ext_0f || ((b | 1) != 0x29 && b != 0x2b)
- : /* movup{s,d} and lddqu are exceptions. */
- ext == ext_0f && ((b | 1) == 0x11 || b == 0xf0)) )
+ ? /* vmov{{a,nt}p{s,d},dqa,ntdq} are exceptions. */
+ ext != ext_0f ||
+ ((b | 1) != 0x29 && b != 0x2b &&
+ ((b | 0x10) != 0x7f || vex.pfx != vex_66) &&
+ b != 0xe7)
+ : /* movup{s,d}, {,mask}movdqu, and lddqu are exceptions. */
+ ext == ext_0f &&
+ ((b | 1) == 0x11 ||
+ ((b | 0x10) == 0x7f && vex.pfx == vex_f3) ||
+ b == 0xf7 || b == 0xf0)) )
mxcsr = MXCSR_MM;
else if ( vcpu_has_misalignsse() )
asm ( "stmxcsr %0" : "=m" (mxcsr) );
@@ -6588,14 +6718,25 @@ x86_emulate(
!is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
ctxt, ops),
EXC_GP, 0);
- if ( (d & SrcMask) == SrcMem )
+ switch ( d & SrcMask )
{
+ case SrcMem:
rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
if ( rc != X86EMUL_OKAY )
goto done;
+ /* fall through */
+ case SrcMem16:
dst.type = OP_NONE;
+ break;
+ default:
+ if ( (d & DstMask) != DstMem )
+ {
+ ASSERT_UNREACHABLE();
+ return X86EMUL_UNHANDLEABLE;
+ }
+ break;
}
- else if ( (d & DstMask) == DstMem )
+ if ( (d & DstMask) == DstMem )
{
fail_if(!ops->write); /* Check before running the stub. */
ASSERT(d & Mov);
@@ -6603,19 +6744,18 @@ x86_emulate(
dst.bytes = op_bytes;
dst.mem = ea.mem;
}
- else if ( (d & SrcMask) == SrcMem16 )
- dst.type = OP_NONE;
- else
- {
- ASSERT_UNREACHABLE();
- return X86EMUL_UNHANDLEABLE;
- }
}
else
dst.type = OP_NONE;

- invoke_stub("", "", "+m" (*mmvalp), "+m" (fic.exn_raised)
- : "a" (mmvalp));
+ /* {,v}maskmov{q,dqu}, as an exception, uses rDI. */
+ if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK |
+ X86EMUL_OPC_ENCODING_MASK)) !=
+ X86EMUL_OPC(0x0f, 0xf7)) )
+ invoke_stub("", "", "+m" (*mmvalp), "+m" (fic.exn_raised)
+ : "a" (mmvalp));
+ else
+ invoke_stub("", "", "+m" (*mmvalp) : "D" (mmvalp));

put_stub(stub);
put_fpu(&fic);
@@ -6871,6 +7011,8 @@ x86_insn_is_mem_access(const struct x86_
case 0xa4 ... 0xa7: /* MOVS / CMPS */
case 0xaa ... 0xaf: /* STOS / LODS / SCAS */
case 0xd7: /* XLAT */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf7): /* MASKMOV{Q,DQU} */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf7): /* VMASKMOVDQU */
return true;

case X86EMUL_OPC(0x0f, 0x01):
@@ -6888,7 +7030,8 @@ x86_insn_is_mem_write(const struct x86_e
switch ( state->desc & DstMask )
{
case DstMem:
- return state->modrm_mod != 3;
+ /* The SrcMem check is to cover {,V}MASKMOV{Q,DQU}. */
+ return state->modrm_mod != 3 || (state->desc & SrcMask) == SrcMem;

case DstBitBase:
case DstImplicit:
@@ -6908,22 +7051,9 @@ x86_insn_is_mem_write(const struct x86_e
case 0x6c: case 0x6d: /* INS */
case 0xa4: case 0xa5: /* MOVS */
case 0xaa: case 0xab: /* STOS */
- case X86EMUL_OPC(0x0f, 0x7e): /* MOVD/MOVQ */
- case X86EMUL_OPC_66(0x0f, 0x7e): /* MOVD/MOVQ */
- case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* VMOVD/VMOVQ */
- case X86EMUL_OPC(0x0f, 0x7f): /* VMOVQ */
- case X86EMUL_OPC_66(0x0f, 0x7f): /* MOVDQA */
- case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* VMOVDQA */
- case X86EMUL_OPC_F3(0x0f, 0x7f): /* MOVDQU */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* VMOVDQU */
case X86EMUL_OPC(0x0f, 0xab): /* BTS */
case X86EMUL_OPC(0x0f, 0xb3): /* BTR */
case X86EMUL_OPC(0x0f, 0xbb): /* BTC */
- case X86EMUL_OPC_66(0x0f, 0xd6): /* MOVQ */
- case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* VMOVQ */
- case X86EMUL_OPC(0x0f, 0xe7): /* MOVNTQ */
- case X86EMUL_OPC_66(0x0f, 0xe7): /* MOVNTDQ */
- case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* VMOVNTDQ */
return true;

case 0xd9:
Andrew Cooper
2017-03-01 13:59:54 UTC
Post by Jan Beulich
Previously supported insns are being converted to the new model, and
several new ones are being added.
To keep the stub handling reasonably simple, integrate SET_SSE_PREFIX()
into copy_REX_VEX(), at once switching the stubs to use an empty REX
prefix instead of a double DS: one (no byte registers are being
accessed, so an empty REX prefix has no effect), except (of course) for
the 32-bit test harness build.
Why switch a %ds override to REX? There doesn't appear to be any benefit.
Post by Jan Beulich
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -364,11 +369,6 @@ enum vex_pfx {
static const uint8_t sse_prefix[] = { 0x66, 0xf3, 0xf2 };
-#define SET_SSE_PREFIX(dst, vex_pfx) do { \
- if ( vex_pfx ) \
- (dst) = sse_prefix[(vex_pfx) - 1]; \
-} while (0)
-
union vex {
uint8_t raw[2];
struct {
@@ -383,15 +383,35 @@ union vex {
};
};
+#ifdef __x86_64__
+# define PFX2 REX_PREFIX
+#else
+# define PFX2 0x3e
+#endif
+#define PFX_BYTES 3
+#define init_prefixes(stub) ({ \
+ uint8_t *buf_ = get_stub(stub); \
+ buf_[0] = 0x3e; \
+ buf_[1] = PFX2; \
+ buf_[2] = 0x0f; \
+ buf_ + 3; \
+})
+
#define copy_REX_VEX(ptr, rex, vex) do { \
if ( (vex).opcx != vex_none ) \
{ \
if ( !mode_64bit() ) \
vex.reg |= 8; \
- ptr[0] = 0xc4, ptr[1] = (vex).raw[0], ptr[2] = (vex).raw[1]; \
+ (ptr)[0 - PFX_BYTES] = 0xc4; \
+ (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+ (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+ } \
+ else \
+ { \
+ if ( (vex).pfx ) \
+ (ptr)[0 - PFX_BYTES] = sse_prefix[(vex).pfx - 1]; \
+ (ptr)[1 - PFX_BYTES] |= rex; \
This is no longer guarded by mode_64bit(). Won't this result in %ds |
rex in the 32bit test stubs?
Post by Jan Beulich
} \
- else if ( mode_64bit() ) \
- ptr[1] = rex | REX_PREFIX; \
} while (0)
union evex {
@@ -5429,6 +5496,57 @@ x86_emulate(
singlestep = _regs._eflags & X86_EFLAGS_TF;
break;
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x50): /* movmskp{s,d} xmm,reg */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ {
+ if ( b != 0x50 )
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ }
+ if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ else
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
Isn't this TwoOp?
Post by Jan Beulich
+ if ( b == 0x50 || !vex.l )
+ host_and_vcpu_must_have(avx);
+ else
+ host_and_vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ dst.bytes = 4;
Somewhere there should probably be an ASSERT() that state->simd_size is
0, so we don't try to invoke the stub twice.
Post by Jan Beulich
+ break;
+
CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
@@ -6553,23 +6686,14 @@ x86_emulate(
if ( state->simd_size )
{
-#ifdef __XEN__
- uint8_t *buf = stub.ptr;
-#else
- uint8_t *buf = get_stub(stub);
-#endif
-
generate_exception_if(!op_bytes, EXC_UD);
generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
EXC_UD);
- if ( !buf )
+ if ( !opc )
BUG();
- if ( vex.opcx == vex_none )
- SET_SSE_PREFIX(buf[0], vex.pfx);
-
- buf[fic.insn_bytes] = 0xc3;
- copy_REX_VEX(buf, rex_prefix, vex);
+ opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
fic.insn_bytes - PFX_BYTES is in the middle of the opcode, isn't it?

~Andrew
Post by Jan Beulich
+ copy_REX_VEX(opc, rex_prefix, vex);
if ( ea.type == OP_MEM )
{
Jan Beulich
2017-03-01 14:19:25 UTC
Post by Andrew Cooper
Post by Jan Beulich
Previously supported insns are being converted to the new model, and
several new ones are being added.
To keep the stub handling reasonably simple, integrate SET_SSE_PREFIX()
into copy_REX_VEX(), at once switching the stubs to use an empty REX
prefix instead of a double DS: one (no byte registers are being
accessed, so an empty REX prefix has no effect), except (of course) for
the 32-bit test harness build.
Why switch a %ds override to REX? There doesn't appear to be any benefit.
It eliminates a mode_64bit() conditional from the non-VEX path in
the macro. And then, honestly, this is a question I would have
expected (if at all) the first time you came across this. I also think
avoiding two identical prefixes is (marginally) better architecture-
wise.
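
To illustrate the effect (a sketch; assuming REX_PREFIX is the usual empty
REX byte 0x40), the non-VEX stub prefix area ends up as:

    /* 64-bit build:  3e 40 0f  (%ds, empty REX, 0f escape)           */
    /* 32-bit build:  3e 3e 0f  (rex is always zero in that case)     */
    /* copy_REX_VEX() then only needs  (ptr)[1 - PFX_BYTES] |= rex;   */
    /* with no mode_64bit() check, instead of the previous  3e 3e 0f  */
    /* plus a conditional  ptr[1] = rex | REX_PREFIX  in 64-bit mode. */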
Post by Andrew Cooper
Post by Jan Beulich
@@ -383,15 +383,35 @@ union vex {
};
};
+#ifdef __x86_64__
+# define PFX2 REX_PREFIX
+#else
+# define PFX2 0x3e
+#endif
+#define PFX_BYTES 3
+#define init_prefixes(stub) ({ \
+ uint8_t *buf_ = get_stub(stub); \
+ buf_[0] = 0x3e; \
+ buf_[1] = PFX2; \
+ buf_[2] = 0x0f; \
+ buf_ + 3; \
+})
+
#define copy_REX_VEX(ptr, rex, vex) do { \
if ( (vex).opcx != vex_none ) \
{ \
if ( !mode_64bit() ) \
vex.reg |= 8; \
- ptr[0] = 0xc4, ptr[1] = (vex).raw[0], ptr[2] = (vex).raw[1]; \
+ (ptr)[0 - PFX_BYTES] = 0xc4; \
+ (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+ (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+ } \
+ else \
+ { \
+ if ( (vex).pfx ) \
+ (ptr)[0 - PFX_BYTES] = sse_prefix[(vex).pfx - 1]; \
+ (ptr)[1 - PFX_BYTES] |= rex; \
This is no longer guarded by mode_64bit(). Won't this result in %ds |
rex in the 32bit test stubs?
Correct. But please realize that rex is zero at all times when
emulating other than 64-bit mode.
Post by Andrew Cooper
Post by Jan Beulich
@@ -5429,6 +5496,57 @@ x86_emulate(
singlestep = _regs._eflags & X86_EFLAGS_TF;
break;
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x50): /* movmskp{s,d} xmm,reg */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ {
+ if ( b != 0x50 )
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ }
+ if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ else
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
Isn't this TwoOp?
Yes, hence the #UD. Or is the question "Why is this being done
here, instead of on the common code path?" If so - the common
code path doing this isn't being reached, as we invoke the stub
inside the case block.
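
For reference, the common-path check being referred to (in the
state->simd_size handling near the end of x86_emulate()) is:

    generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
                          EXC_UD);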
Post by Andrew Cooper
Post by Jan Beulich
+ if ( b == 0x50 || !vex.l )
+ host_and_vcpu_must_have(avx);
+ else
+ host_and_vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ dst.bytes = 4;
Somewhere there should probably be an ASSERT() that state->simd_size is
0, so we don't try to invoke the stub twice.
I can do this, but it didn't seem natural to do so when putting this
together, as - obviously - I did produce/check the table entries at
basically the same time as I did write this code.
Post by Andrew Cooper
Post by Jan Beulich
@@ -6553,23 +6686,14 @@ x86_emulate(
if ( state->simd_size )
{
-#ifdef __XEN__
- uint8_t *buf = stub.ptr;
-#else
- uint8_t *buf = get_stub(stub);
-#endif
Note, btw, how the ugly #ifdef-ary goes away here.
Post by Andrew Cooper
Post by Jan Beulich
generate_exception_if(!op_bytes, EXC_UD);
generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
EXC_UD);
- if ( !buf )
+ if ( !opc )
BUG();
- if ( vex.opcx == vex_none )
- SET_SSE_PREFIX(buf[0], vex.pfx);
-
- buf[fic.insn_bytes] = 0xc3;
- copy_REX_VEX(buf, rex_prefix, vex);
+ opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
fic.insn_bytes - PFX_BYTES is in the middle of the opcode, isn't it?
No - note the difference between opc and buf: The former points
past the common prefix bytes.

Jan
Andrew Cooper
2017-03-01 19:56:07 UTC
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
Previously supported insns are being converted to the new model, and
several new ones are being added.
To keep the stub handling reasonably simple, integrate SET_SSE_PREFIX()
into copy_REX_VEX(), at once switching the stubs to use an empty REX
prefix instead of a double DS: one (no byte registers are being
accessed, so an empty REX prefix has no effect), except (of course) for
the 32-bit test harness build.
Why switch a %ds override to REX? There doesn't appear to be any benefit.
It eliminates a mode_64bit() conditional from the non-VEX path in
the macro. And then, honestly, this is a question I would have
expected (if at all) the first time you came across this.
This is an extremely complicated series to review. I am sorry, but I
can't always spot all issues in v1.
Post by Jan Beulich
I also think avoiding two identical prefixes is (marginally) better architecture-
wise.
There is no specific advice in the AMD optimisation guide.

The Intel guide warns against unnecessary use of 0x66 and 0x67
(specifically in the Length-Changing Prefixes section), which
dynamically change the length of the instruction. This doesn't apply to
us in this situation.

The only other reference to prefixes comes from the Other Decoding
Guidelines section, which states (obviously) that extra prefixes decrease
instruction bandwidth (as more bytes need to be consumed to decode the
instruction), and that any instruction with multiple prefixes requires
decoding in the first decoder, which creates contention for that resource.

I can't see anything suggesting that a double %ds vs a single %ds and
rex prefix would make any difference.
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
@@ -383,15 +383,35 @@ union vex {
};
};
+#ifdef __x86_64__
+# define PFX2 REX_PREFIX
+#else
+# define PFX2 0x3e
+#endif
+#define PFX_BYTES 3
+#define init_prefixes(stub) ({ \
+ uint8_t *buf_ = get_stub(stub); \
+ buf_[0] = 0x3e; \
+ buf_[1] = PFX2; \
+ buf_[2] = 0x0f; \
+ buf_ + 3; \
+})
+
#define copy_REX_VEX(ptr, rex, vex) do { \
if ( (vex).opcx != vex_none ) \
{ \
if ( !mode_64bit() ) \
vex.reg |= 8; \
- ptr[0] = 0xc4, ptr[1] = (vex).raw[0], ptr[2] = (vex).raw[1]; \
+ (ptr)[0 - PFX_BYTES] = 0xc4; \
+ (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+ (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+ } \
+ else \
+ { \
+ if ( (vex).pfx ) \
+ (ptr)[0 - PFX_BYTES] = sse_prefix[(vex).pfx - 1]; \
+ (ptr)[1 - PFX_BYTES] |= rex; \
This is no longer guarded by mode_64bit(). Won't this result in %ds |
rex in the 32bit test stubs?
Correct. But please realize that rex is zero at all times when
emulating other than 64-bit mode.
Then please leave a comment explaining why this is safe.
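
For illustration, one possible shape for such a comment in the non-VEX
branch of copy_REX_VEX() (a sketch only, reusing the quoted hunk):

    else                                                        \
    {                                                           \
        /*                                                      \
         * Note: rex can only be non-zero when emulating 64-bit \
         * mode, so OR'ing it in below leaves the second prefix \
         * byte untouched in the 32-bit harness build.          \
         */                                                     \
        if ( (vex).pfx )                                        \
            (ptr)[0 - PFX_BYTES] = sse_prefix[(vex).pfx - 1];   \
        (ptr)[1 - PFX_BYTES] |= rex;                            \
    }                                                           \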
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
@@ -5429,6 +5496,57 @@ x86_emulate(
singlestep = _regs._eflags & X86_EFLAGS_TF;
break;
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x50): /* movmskp{s,d} xmm,reg */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ {
+ if ( b != 0x50 )
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ }
+ if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ else
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
Isn't this TwoOp?
Yes, hence the #UD. Or is the question "Why is this being done
here, instead of on the common code path?" If so - the common
code path doing this isn't being reached, as we invoke the stub
inside the case block.
My question was actually "Why isn't this based on d & TwoOp"?
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
+ if ( b == 0x50 || !vex.l )
+ host_and_vcpu_must_have(avx);
+ else
+ host_and_vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ dst.bytes = 4;
Somewhere there should probably be an ASSERT() that state->simd_size is
0, so we don't try to invoke the stub twice.
I can do this, but it didn't seem natural to do so when putting this
together, as - obviously - I did produce/check the table entries at
basically the same time as I did write this code.
It is obvious to you now, and I do trust that you checked the
correctness as it related to this patch, but it will not be obvious in 6
months' time with another dev-cycle's worth of change on top.

The emulator is very hard-to-follow code. Complexity is a necessary
consequence of its purpose, but also the source of a lot of bugs; most
of them very subtle. Wherever possible, I would prefer that we take all
opportunities to make the logic easier to follow, and harder to
accidentally break with new changes, for the sake of everyone needing to
edit it in the future.

In this specific case, I don't wish to prescribe exactly how to prevent
accidental breakage, but some kind of assertion that we don't execute a
stub twice would be very wise, because recent history has shown that AFL
is very good at reaching unintended codepaths.
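
As a concrete (purely illustrative) example, something along these lines at
the end of such a case block would catch it, assuming the relevant table
entries indeed leave state->simd_size at simd_none:

    ASSERT(!state->simd_size);   /* stub was already invoked above */
    dst.bytes = 4;
    break;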
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
generate_exception_if(!op_bytes, EXC_UD);
generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
EXC_UD);
- if ( !buf )
+ if ( !opc )
BUG();
- if ( vex.opcx == vex_none )
- SET_SSE_PREFIX(buf[0], vex.pfx);
-
- buf[fic.insn_bytes] = 0xc3;
- copy_REX_VEX(buf, rex_prefix, vex);
+ opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
fic.insn_bytes - PFX_BYTES is in the middle of the opcode, isn't it?
No - note the difference between opc and buf: The former points
past the common prefix bytes.
Oh - so it does.

~Andrew
Jan Beulich
2017-03-02 08:07:13 UTC
Post by Andrew Cooper
Post by Jan Beulich
I also think avoiding two identical prefixes is (marginally) better architecture-
wise.
There is no specific advice in the AMD optimisation guide.
The Intel guide warns against unnecessary use of 0x66 and 0x67
(specifically in the Length-Changing Prefixes section), which
dynamically change the length of the instruction. This doesn't apply to
us in this situation.
The only other reference to prefixes comes from the Other Decoding
Guidelines section, which states (obviously) that extra prefixes decrease
instruction bandwidth (as more bytes need to be consumed to decode the
instruction), and that any instruction with multiple prefixes requires
decoding in the first decoder, which creates contention for that resource.
I can't see anything suggesting that a double %ds vs a single %ds and
rex prefix would make any difference.
Well - performance isn't of interest here anyway, only correctness
is. Since not emitting multiple identical prefixes is marginally better
and since using REX here is slightly better overall code, I'd prefer
for it to stay this way.
Post by Andrew Cooper
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
@@ -5429,6 +5496,57 @@ x86_emulate(
singlestep = _regs._eflags & X86_EFLAGS_TF;
break;
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x50): /* movmskp{s,d} xmm,reg */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ {
+ if ( b != 0x50 )
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ }
+ if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ else
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
Isn't this TwoOp?
Yes, hence the #UD. Or is the question "Why is this being done
here, instead of on the common code path?" If so - the common
code path doing this isn't being reached, as we invoke the stub
inside the case block.
My question was actually "Why isn't this based on d & TwoOp"?
Because the above variant is more explicit imo: Why depend on
some derived info when we can use the original encoding directly?
Post by Andrew Cooper
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
+ if ( b == 0x50 || !vex.l )
+ host_and_vcpu_must_have(avx);
+ else
+ host_and_vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ dst.bytes = 4;
Somewhere there should probably be an ASSERT() that state->simd_size is
0, so we don't try to invoke the stub twice.
I can do this, but it didn't seem natural to do so when putting this
together, as - obviously - I did produce/check the table entries at
basically the same time as I did write this code.
It is obvious to you now, and I do trust that you checked the
correctness as it related to this patch, but it will not be obvious in 6
months' time with another dev-cycle's worth of change on top.
Right, that's what I was trying to hint at with my explanation of
why I didn't think of adding ASSERT()s to this effect.

Jan
Jan Beulich
2017-02-28 12:51:36 UTC
Signed-off-by: Jan Beulich <***@suse.com>
---
v4: Add missing copy_REX_VEX().
v3: Ignore VEX.l. Add fic.exn_raised constraint to invoke_stub() use.
v2: Add missing RET to stub. Generate #UD (instead of simply failing)
when VEX.l is disallowed.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -254,7 +254,7 @@ static const struct {
[0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
[0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
- [0x2e ... 0x2f] = { ImplicitOps|ModRM },
+ [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp },
[0x30 ... 0x35] = { ImplicitOps },
[0x37] = { ImplicitOps },
[0x38] = { DstReg|SrcMem|ModRM },
@@ -5468,6 +5468,55 @@ x86_emulate(
state->simd_size = simd_none;
break;

+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2e): /* ucomis{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2e): /* vucomis{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2f): /* comis{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2f): /* vcomis{s,d} xmm/mem,xmm */
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx )
+ vcpu_must_have(sse2);
+ else
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, vex.pfx ? 8 : 4,
+ ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1] &= 0x38;
+ }
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+ _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+ [eflags] "+g" (_regs._eflags),
+ [tmp] "=&r" (cr4 /* dummy */), "+m" (*mmvalp),
+ "+m" (fic.exn_raised)
+ : [func] "rm" (stub.func), "a" (mmvalp),
+ [mask] "i" (EFLAGS_MASK));
+
+ put_stub(stub);
+ put_fpu(&fic);
+ break;
+
case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
generate_exception_if(!mode_ring0(), EXC_GP, 0);
fail_if(ops->write_msr == NULL);
Andrew Cooper
2017-03-01 14:16:59 UTC
Post by Jan Beulich
---
v4: Add missing copy_REX_VEX().
v3: Ignore VEX.l. Add fic.exn_raised constraint to invoke_stub() use.
v2: Add missing RET to stub. Generate #UD (instead of simply failing)
when VEX.l is disallowed.
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -254,7 +254,7 @@ static const struct {
[0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
[0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
- [0x2e ... 0x2f] = { ImplicitOps|ModRM },
+ [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp },
[0x30 ... 0x35] = { ImplicitOps },
[0x37] = { ImplicitOps },
[0x38] = { DstReg|SrcMem|ModRM },
@@ -5468,6 +5468,55 @@ x86_emulate(
state->simd_size = simd_none;
break;
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2e): /* ucomis{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2e): /* vucomis{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2f): /* comis{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2f): /* vcomis{s,d} xmm/mem,xmm */
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx )
+ vcpu_must_have(sse2);
+ else
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
This is starting to become a common sequence. Is there any sensible way
to factor it out in a non-macro way, to avoid the compiler instantiating
it at the top of many basic blocks?
Post by Jan Beulich
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, vex.pfx ? 8 : 4,
+ ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1] &= 0x38;
+ }
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+ _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+ [eflags] "+g" (_regs._eflags),
+ [tmp] "=&r" (cr4 /* dummy */), "+m" (*mmvalp),
This is latently dangerous. It would be better to have an explicit
"unsigned long dummy;", which the compiler will perfectly easily elide
during register scheduling.

~Andrew
Post by Jan Beulich
+ "+m" (fic.exn_raised)
+ : [func] "rm" (stub.func), "a" (mmvalp),
+ [mask] "i" (EFLAGS_MASK));
+
+ put_stub(stub);
+ put_fpu(&fic);
+ break;
+
case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
generate_exception_if(!mode_ring0(), EXC_GP, 0);
fail_if(ops->write_msr == NULL);
Jan Beulich
2017-03-01 14:26:39 UTC
Post by Andrew Cooper
Post by Jan Beulich
@@ -5468,6 +5468,55 @@ x86_emulate(
state->simd_size = simd_none;
break;
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2e): /* ucomis{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2e): /* vucomis{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2f): /* comis{s,d} xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2f): /* vcomis{s,d} xmm/mem,xmm */
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx )
+ vcpu_must_have(sse2);
+ else
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
This is starting to become a common sequence. Is there any sensible way
to factor it out in a non-macro way, to avoid the compiler instantiating
it at the top of many basic blocks?
While I would have wanted to, I couldn't think of one which wouldn't
be almost as ugly as the redundancy. The main problem, as you likely
understand yourself, is that we'd need parameters for all the used
features as well as all the involved X86EMUL_FPU_* values.
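Purely as a hypothetical sketch (the name simd_check_and_get_fpu is
made up here, and it stays a macro since the feature checks bail out
of x86_emulate() via goto), such a helper would end up looking like

#define simd_check_and_get_fpu(feat, avx_feat, fpu, fic) do {     \
    if ( vex.opcx == vex_none )                                   \
    {                                                             \
        vcpu_must_have(feat);                                     \
        get_fpu(fpu, fic);                                        \
    }                                                             \
    else                                                          \
    {                                                             \
        host_and_vcpu_must_have(avx_feat);                        \
        get_fpu(X86EMUL_FPU_ymm, fic);                            \
    }                                                             \
} while ( 0 )

with the vex.pfx based choice between e.g. sse and sse2 still left to
the call sites, which hardly reads better than the open-coded form.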
Post by Andrew Cooper
Post by Jan Beulich
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, vex.pfx ? 8 : 4,
+ ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1] &= 0x38;
+ }
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+ _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+ [eflags] "+g" (_regs._eflags),
+ [tmp] "=&r" (cr4 /* dummy */), "+m" (*mmvalp),
This is latently dangerous. It would be better to have an explicit
"unsigned long dummy;", which the compiler will perfectly easily elide
during register scheduling.
The thing I want to avoid as much as possible is these ugly,
improperly indented extra scopes following case labels. If
putting a dummy variable into the whole switch() scope is okay
with you, I could live with that. But then again I don't see the
danger here: there's no imaginable use for cr4 in this piece of
code.

Jan
Andrew Cooper
2017-03-01 14:31:52 UTC
Permalink
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, vex.pfx ? 8 : 4,
+ ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1] &= 0x38;
+ }
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+ _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+ [eflags] "+g" (_regs._eflags),
+ [tmp] "=&r" (cr4 /* dummy */), "+m" (*mmvalp),
This is latently dangerous. It would be better to have an explicit
"unsigned long dummy;", which the compiler will perfectly easily elide
during register scheduling.
The thing I want to avoid as much as possible is these ugly,
improperly indented extra scopes following case labels. If
putting a dummy variable into the whole switch() scope is okay
with you, I could live with that. But then again I don't see the
danger here: there's no imaginable use for cr4 in this piece of
code.
Whole switch() scope is fine. I see you have similar dummy examples in
later patches as well.
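For reference, a minimal standalone illustration of the agreed pattern
(made-up names, not emulator code): an explicit dummy output operand
gives the asm a scratch register without overloading an unrelated
variable such as cr4, and the value is simply discarded afterwards.

#include <stdio.h>

static unsigned long add_via_scratch(unsigned long a, unsigned long b)
{
    unsigned long res, dummy;   /* dummy is written by the asm, never read */

    asm ( "lea (%[a],%[b]), %[tmp]\n\t"
          "mov %[tmp], %[out]"
          : [out] "=r" (res), [tmp] "=&r" (dummy)
          : [a] "r" (a), [b] "r" (b) );

    return res;
}

int main(void)
{
    printf("%lu\n", add_via_scratch(2, 5));   /* prints 7 */
    return 0;
}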

~Andrew
Jan Beulich
2017-02-28 12:52:13 UTC
Permalink
This involves fixing a decode bug: VEX-encoded insns aren't necessarily
followed by a ModR/M byte.
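For illustration only (byte values per the SDM): the vzero{all,upper}
insns handled further down are exactly such cases, consisting of just a
VEX prefix and an opcode byte.

/* Complete instructions - no ModR/M byte follows the opcode. */
static const unsigned char vzeroupper_insn[] = { 0xc5, 0xf8, 0x77 }; /* VEX.128.0F 77 */
static const unsigned char vzeroall_insn[]   = { 0xc5, 0xfc, 0x77 }; /* VEX.256.0F 77 */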

Signed-off-by: Jan Beulich <***@suse.com>
---
v4: Add missing setting of op_bytes to insertq (register form)
handling.
v3: Simplify handling of extrq/insertq register forms. Use simd_0f_xmm
label.
v2: Correct {,v}pextrw operand descriptor.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -274,10 +274,11 @@ static const struct {
[0x6e] = { DstImplicit|SrcMem|ModRM|Mov },
[0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int },
[0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
- [0x71 ... 0x73] = { SrcImmByte|ModRM },
+ [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM },
[0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0x77] = { DstImplicit|SrcNone },
- [0x78 ... 0x79] = { ModRM },
+ [0x78] = { ImplicitOps|ModRM },
+ [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
[0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
[0x7e] = { DstMem|SrcImplicit|ModRM|Mov },
[0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
@@ -315,7 +316,7 @@ static const struct {
[0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp },
[0xc3] = { DstMem|SrcReg|ModRM|Mov },
[0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
- [0xc5] = { SrcImmByte|ModRM },
+ [0xc5] = { DstReg|SrcImmByte|ModRM|Mov },
[0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp },
[0xc7] = { ImplicitOps|ModRM },
[0xc8 ... 0xcf] = { ImplicitOps },
@@ -2505,12 +2506,21 @@ x86_decode(

opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);

+ if ( !(d & ModRM) )
+ {
+ modrm_reg = modrm_rm = modrm_mod = modrm = 0;
+ break;
+ }
+
modrm = insn_fetch_type(uint8_t);
modrm_mod = (modrm & 0xc0) >> 6;

break;
}
+ }

+ if ( d & ModRM )
+ {
modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
modrm_rm = modrm & 0x07;

@@ -5658,6 +5668,18 @@ x86_emulate(
CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */
case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ simd_0f_to_gpr:
+ opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+
generate_exception_if(ea.type != OP_REG, EXC_UD);

if ( vex.opcx == vex_none )
@@ -5685,17 +5707,6 @@ x86_emulate(
get_fpu(X86EMUL_FPU_ymm, &fic);
}

- opc = init_prefixes(stub);
- opc[0] = b;
- /* Convert GPR destination to %rAX. */
- rex_prefix &= ~REX_R;
- vex.r = 1;
- if ( !mode_64bit() )
- vex.w = 0;
- opc[1] = modrm & 0xc7;
- fic.insn_bytes = PFX_BYTES + 2;
- opc[2] = 0xc3;
-
copy_REX_VEX(opc, rex_prefix, vex);
invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));

@@ -5954,6 +5965,132 @@ x86_emulate(
fic.insn_bytes = PFX_BYTES + 3;
break;

+ CASE_SIMD_PACKED_INT(0x0f, 0x71): /* Grp12 */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x71):
+ CASE_SIMD_PACKED_INT(0x0f, 0x72): /* Grp13 */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x72):
+ switch ( modrm_reg & 7 )
+ {
+ case 2: /* psrl{w,d} $imm8,{,x}mm */
+ /* vpsrl{w,d} $imm8,{x,y}mm,{x,y}mm */
+ case 4: /* psra{w,d} $imm8,{,x}mm */
+ /* vpsra{w,d} $imm8,{x,y}mm,{x,y}mm */
+ case 6: /* psll{w,d} $imm8,{,x}mm */
+ /* vpsll{w,d} $imm8,{x,y}mm,{x,y}mm */
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ simd_0f_shift_imm:
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+
+ if ( vex.opcx != vex_none )
+ {
+ if ( vex.l )
+ host_and_vcpu_must_have(avx2);
+ else
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
+ opc[2] = imm1;
+ fic.insn_bytes = PFX_BYTES + 3;
+ simd_0f_reg_only:
+ opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", [dummy_out] "=g" (cr4) : [dummy_in] "i" (0) );
+
+ put_stub(stub);
+ put_fpu(&fic);
+ break;
+
+ case X86EMUL_OPC(0x0f, 0x73): /* Grp14 */
+ switch ( modrm_reg & 7 )
+ {
+ case 2: /* psrlq $imm8,mm */
+ case 6: /* psllq $imm8,mm */
+ goto simd_0f_shift_imm;
+ }
+ goto cannot_emulate;
+
+ case X86EMUL_OPC_66(0x0f, 0x73):
+ case X86EMUL_OPC_VEX_66(0x0f, 0x73):
+ switch ( modrm_reg & 7 )
+ {
+ case 2: /* psrlq $imm8,xmm */
+ /* vpsrlq $imm8,{x,y}mm,{x,y}mm */
+ case 3: /* psrldq $imm8,xmm */
+ /* vpsrldq $imm8,{x,y}mm,{x,y}mm */
+ case 6: /* psllq $imm8,xmm */
+ /* vpsllq $imm8,{x,y}mm,{x,y}mm */
+ case 7: /* pslldq $imm8,xmm */
+ /* vpslldq $imm8,{x,y}mm,{x,y}mm */
+ goto simd_0f_shift_imm;
+ }
+ goto cannot_emulate;
+
+ case X86EMUL_OPC(0x0f, 0x77): /* emms */
+ case X86EMUL_OPC_VEX(0x0f, 0x77): /* vzero{all,upper} */
+ if ( vex.opcx != vex_none )
+ {
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ fic.insn_bytes = PFX_BYTES + 1;
+ goto simd_0f_reg_only;
+
+ case X86EMUL_OPC_66(0x0f, 0x78): /* Grp17 */
+ switch ( modrm_reg & 7 )
+ {
+ case 0: /* extrq $imm8,$imm8,xmm */
+ break;
+ default:
+ goto cannot_emulate;
+ }
+ /* fall through */
+ case X86EMUL_OPC_F2(0x0f, 0x78): /* insertq $imm8,$imm8,xmm,xmm */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+
+ host_and_vcpu_must_have(sse4a);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
+ opc[2] = imm1;
+ opc[3] = imm2;
+ fic.insn_bytes = PFX_BYTES + 4;
+ goto simd_0f_reg_only;
+
+ case X86EMUL_OPC_66(0x0f, 0x79): /* extrq xmm,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0x79): /* insertq xmm,xmm */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+ host_and_vcpu_must_have(sse4a);
+ op_bytes = 8;
+ goto simd_0f_xmm;
+
case X86EMUL_OPC_F3(0x0f, 0x7e): /* movq xmm/m64,xmm */
case X86EMUL_OPC_VEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
generate_exception_if(vex.l, EXC_UD);
@@ -6309,6 +6446,22 @@ x86_emulate(
ea.type = OP_MEM;
goto simd_0f_int_imm8;

+ case X86EMUL_OPC_VEX_66(0x0f, 0xc5): /* vpextrw $imm8,xmm,reg */
+ generate_exception_if(vex.l, EXC_UD);
+ /* fall through */
+ CASE_SIMD_PACKED_INT(0x0f, 0xc5): /* pextrw $imm8,{,x}mm,reg */
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ opc[2] = imm1;
+ fic.insn_bytes = PFX_BYTES + 3;
+ goto simd_0f_to_gpr;
+
case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
{
union {
Andrew Cooper
2017-03-01 14:36:42 UTC
Permalink
Post by Jan Beulich
@@ -2505,12 +2506,21 @@ x86_decode(
opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ if ( !(d & ModRM) )
+ {
+ modrm_reg = modrm_rm = modrm_mod = modrm = 0;
+ break;
+ }
+
modrm = insn_fetch_type(uint8_t);
modrm_mod = (modrm & 0xc0) >> 6;
break;
}
+ }
+ if ( d & ModRM )
+ {
modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
modrm_rm = modrm & 0x07;
Doesn't this hunk want splitting out into its own patch and
backporting? Xen 4.8's x86_decode_insn() was supposedly able to provide
the correct length.
Post by Jan Beulich
@@ -5658,6 +5668,18 @@ x86_emulate(
CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */
case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
As an observation, converting GPR to %rAX is also becoming a common
sequence.

~Andrew
Jan Beulich
2017-03-01 14:43:24 UTC
Permalink
Post by Andrew Cooper
Post by Jan Beulich
@@ -2505,12 +2506,21 @@ x86_decode(
opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ if ( !(d & ModRM) )
+ {
+ modrm_reg = modrm_rm = modrm_mod = modrm = 0;
+ break;
+ }
+
modrm = insn_fetch_type(uint8_t);
modrm_mod = (modrm & 0xc0) >> 6;
break;
}
+ }
+ if ( d & ModRM )
+ {
modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
modrm_rm = modrm & 0x07;
Doesn't this hunk want splitting out into its own patch and
backporting? Xen 4.8's x86_decode_insn() was supposedly able to provide
the correct length.
Well, if this was affecting instructions we could even remotely
expect to make it here, I would have done it in a separate
patch, but vzero{all,upper} just seem too unlikely to warrant
a backport.
So I'd prefer to keep it as one patch, but if you make your R-b
dependent on the split, then I'll do so. Let me know.

Jan
Andrew Cooper
2017-03-01 20:01:29 UTC
Permalink
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
@@ -2505,12 +2506,21 @@ x86_decode(
opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ if ( !(d & ModRM) )
+ {
+ modrm_reg = modrm_rm = modrm_mod = modrm = 0;
+ break;
+ }
+
modrm = insn_fetch_type(uint8_t);
modrm_mod = (modrm & 0xc0) >> 6;
break;
}
+ }
+ if ( d & ModRM )
+ {
modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
modrm_rm = modrm & 0x07;
Doesn't this hunk want splitting out into its own patch and
backporting? Xen 4.8's x86_decode_insn() was supposedly able to provide
the correct length.
Well, if this was affecting instructions we could even remotely
expect to make it here, I would have done it in a separate
patch, but vzero{all,upper} just seem too unlikely to warrant
a backport.
So I'd prefer to keep it as one patch, but if you make your R-b
dependent on the split, then I'll do so. Let me know.
It really depends on what the chances are that anyone is making use of
the enhancements. I'd personally err on the side of backporting, but I
also accept that it is probably very unlikely.

At least it should be obvious to diagnose and easy to backport if anyone
comes across the problem.

~Andrew
Jan Beulich
2017-02-28 12:52:46 UTC
Permalink
Signed-off-by: Jan Beulich <***@suse.com>
---
v4: Drop the host_and_ part from the AVX checks.
v3: Re-base.

--- a/tools/fuzz/x86_instruction_emulator/x86-insn-emulator-fuzzer.c
+++ b/tools/fuzz/x86_instruction_emulator/x86-insn-emulator-fuzzer.c
@@ -660,7 +660,7 @@ int LLVMFuzzerTestOneInput(const uint8_t
};
int rc;

- stack_exec = emul_test_make_stack_executable();
+ stack_exec = emul_test_init();
if ( !stack_exec )
{
printf("Warning: Stack could not be made executable (%d).\n", errno);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -219,7 +219,7 @@ int main(int argc, char **argv)
}
instr = (char *)res + 0x100;

- stack_exec = emul_test_make_stack_executable();
+ stack_exec = emul_test_init();

if ( !stack_exec )
printf("Warning: Stack could not be made executable (%d).\n", errno);
@@ -2395,6 +2395,87 @@ int main(int argc, char **argv)
goto fail;
printf("okay\n");
}
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing stmxcsr (%edx)...");
+ if ( cpu_has_sse )
+ {
+ decl_insn(stmxcsr);
+
+ asm volatile ( put_insn(stmxcsr, "stmxcsr (%0)") :: "d" (NULL) );
+
+ res[0] = 0x12345678;
+ res[1] = 0x87654321;
+ asm ( "stmxcsr %0" : "=m" (res[2]) );
+ set_insn(stmxcsr);
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(stmxcsr) ||
+ res[0] != res[2] || res[1] != 0x87654321 )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing ldmxcsr 4(%ecx)...");
+ if ( cpu_has_sse )
+ {
+ decl_insn(ldmxcsr);
+
+ asm volatile ( put_insn(ldmxcsr, "ldmxcsr 4(%0)") :: "c" (NULL) );
+
+ set_insn(ldmxcsr);
+ res[1] = mxcsr_mask;
+ regs.ecx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "stmxcsr %0; ldmxcsr %1" : "=m" (res[0]) : "m" (res[2]) );
+ if ( rc != X86EMUL_OKAY || !check_eip(ldmxcsr) ||
+ res[0] != mxcsr_mask )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vstmxcsr (%ecx)...");
+ if ( cpu_has_avx )
+ {
+ decl_insn(vstmxcsr);
+
+ asm volatile ( put_insn(vstmxcsr, "vstmxcsr (%0)") :: "c" (NULL) );
+
+ res[0] = 0x12345678;
+ res[1] = 0x87654321;
+ set_insn(vstmxcsr);
+ regs.ecx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vstmxcsr) ||
+ res[0] != res[2] || res[1] != 0x87654321 )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vldmxcsr 4(%edx)...");
+ if ( cpu_has_avx )
+ {
+ decl_insn(vldmxcsr);
+
+ asm volatile ( put_insn(vldmxcsr, "vldmxcsr 4(%0)") :: "d" (NULL) );
+
+ set_insn(vldmxcsr);
+ res[1] = mxcsr_mask;
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "stmxcsr %0; ldmxcsr %1" : "=m" (res[0]) : "m" (res[2]) );
+ if ( rc != X86EMUL_OKAY || !check_eip(vldmxcsr) ||
+ res[0] != mxcsr_mask )
+ goto fail;
+ printf("okay\n");
+ }
else
printf("skipped\n");

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -22,10 +22,29 @@
#define get_stub(stb) ((void *)((stb).addr = (uintptr_t)(stb).buf))
#define put_stub(stb)

-bool emul_test_make_stack_executable(void)
+uint32_t mxcsr_mask = 0x0000ffbf;
+
+bool emul_test_init(void)
{
unsigned long sp;

+ if ( cpu_has_fxsr )
+ {
+ static union __attribute__((__aligned__(16))) {
+ char x[464];
+ struct {
+ uint32_t other[6];
+ uint32_t mxcsr;
+ uint32_t mxcsr_mask;
+ /* ... */
+ };
+ } fxs;
+
+ asm ( "fxsave %0" : "=m" (fxs) );
+ if ( fxs.mxcsr_mask )
+ mxcsr_mask = fxs.mxcsr_mask;
+ }
+
/*
* Mark the entire stack executable so that the stub executions
* don't fault
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -42,8 +42,10 @@

#define is_canonical_address(x) (((int64_t)(x) >> 47) == ((int64_t)(x) >> 63))

+extern uint32_t mxcsr_mask;
+
#define MMAP_SZ 16384
-bool emul_test_make_stack_executable(void);
+bool emul_test_init(void);

#include "x86_emulate/x86_emulate.h"

@@ -68,6 +70,12 @@ static inline uint64_t xgetbv(uint32_t x
(res.d & (1U << 23)) != 0; \
})

+#define cpu_has_fxsr ({ \
+ struct cpuid_leaf res; \
+ emul_test_cpuid(1, 0, &res, NULL); \
+ (res.d & (1U << 24)) != 0; \
+})
+
#define cpu_has_sse ({ \
struct cpuid_leaf res; \
emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -2173,7 +2173,6 @@ x86_decode_twobyte(
case 0x50 ... 0x77:
case 0x79 ... 0x7d:
case 0x7f:
- case 0xae:
case 0xc2 ... 0xc3:
case 0xc5 ... 0xc6:
case 0xd0 ... 0xfe:
@@ -2204,6 +2203,24 @@ x86_decode_twobyte(
}
break;

+ case 0xae:
+ ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ /* fall through */
+ case X86EMUL_OPC_VEX(0, 0xae):
+ switch ( modrm_reg & 7 )
+ {
+ case 2: /* {,v}ldmxcsr */
+ state->desc = DstImplicit | SrcMem | ModRM | Mov;
+ op_bytes = 4;
+ break;
+
+ case 3: /* {,v}stmxcsr */
+ state->desc = DstMem | SrcImplicit | ModRM | Mov;
+ op_bytes = 4;
+ break;
+ }
+ break;
+
case 0xb8: /* jmpe / popcnt */
if ( rep_prefix() )
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
@@ -6191,6 +6208,23 @@ x86_emulate(
case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
switch ( modrm_reg & 7 )
{
+ case 2: /* ldmxcsr */
+ generate_exception_if(vex.pfx, EXC_UD);
+ vcpu_must_have(sse);
+ ldmxcsr:
+ generate_exception_if(src.type != OP_MEM, EXC_UD);
+ generate_exception_if(src.val & ~mxcsr_mask, EXC_GP, 0);
+ asm volatile ( "ldmxcsr %0" :: "m" (src.val) );
+ break;
+
+ case 3: /* stmxcsr */
+ generate_exception_if(vex.pfx, EXC_UD);
+ vcpu_must_have(sse);
+ stmxcsr:
+ generate_exception_if(dst.type != OP_MEM, EXC_UD);
+ asm volatile ( "stmxcsr %0" : "=m" (dst.val) );
+ break;
+
case 5: /* lfence */
fail_if(modrm_mod != 3);
generate_exception_if(vex.pfx, EXC_UD);
@@ -6234,6 +6268,20 @@ x86_emulate(
}
break;

+ case X86EMUL_OPC_VEX(0x0f, 0xae): /* Grp15 */
+ switch ( modrm_reg & 7 )
+ {
+ case 2: /* vldmxcsr */
+ generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+ vcpu_must_have(avx);
+ goto ldmxcsr;
+ case 3: /* vstmxcsr */
+ generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+ vcpu_must_have(avx);
+ goto stmxcsr;
+ }
+ goto cannot_emulate;
+
case X86EMUL_OPC_F3(0x0f, 0xae): /* Grp15 */
fail_if(modrm_mod != 3);
generate_exception_if((modrm_reg & 4) || !mode_64bit(), EXC_UD);
--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -29,7 +29,7 @@ unsigned int *__read_mostly xstate_sizes
u64 __read_mostly xstate_align;
static unsigned int __read_mostly xstate_features;

-static uint32_t __read_mostly mxcsr_mask = 0x0000ffbf;
+uint32_t __read_mostly mxcsr_mask = 0x0000ffbf;

/* Cached xcr0 for fast read */
static DEFINE_PER_CPU(uint64_t, xcr0);
--- a/xen/include/asm-x86/xstate.h
+++ b/xen/include/asm-x86/xstate.h
@@ -15,6 +15,8 @@
#define FCW_RESET 0x0040
#define MXCSR_DEFAULT 0x1f80

+extern uint32_t mxcsr_mask;
+
#define XSTATE_CPUID 0x0000000d

#define XCR_XFEATURE_ENABLED_MASK 0x00000000 /* index of XCR0 */
Andrew Cooper
2017-03-01 14:57:44 UTC
Permalink
Reviewed-by: Andrew Cooper <***@citrix.com>
Jan Beulich
2017-02-28 12:53:24 UTC
Permalink
... as the only post-SSE2 move insn.

Signed-off-by: Jan Beulich <***@suse.com>
---
v3: Re-base.
v2: Re-base.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -2398,6 +2398,74 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing movntdqa 16(%edx),%xmm4...");
+ if ( stack_exec && cpu_has_sse4_1 )
+ {
+ decl_insn(movntdqa);
+
+ asm volatile ( "pcmpgtb %%xmm4, %%xmm4\n"
+ put_insn(movntdqa, "movntdqa 16(%0), %%xmm4")
+ :: "d" (NULL) );
+
+ set_insn(movntdqa);
+ memset(res, 0x55, 64);
+ memset(res + 4, 0xff, 16);
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movntdqa) )
+ goto fail;
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm4, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vmovntdqa (%ecx),%ymm4...");
+ if ( stack_exec && cpu_has_avx2 )
+ {
+ decl_insn(vmovntdqa);
+
+#if 0 /* Don't use AVX2 instructions for now */
+ asm volatile ( "vpxor %%ymm4, %%ymm4, %%ymm4\n"
+ put_insn(vmovntdqa, "vmovntdqa (%0), %%ymm4")
+ :: "c" (NULL) );
+#else
+ asm volatile ( "vpxor %xmm4, %xmm4, %xmm4\n"
+ put_insn(vmovntdqa,
+ ".byte 0xc4, 0xe2, 0x7d, 0x2a, 0x21") );
+#endif
+
+ set_insn(vmovntdqa);
+ memset(res, 0x55, 96);
+ memset(res + 8, 0xff, 32);
+ regs.ecx = (unsigned long)(res + 8);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmovntdqa) )
+ goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+ asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+ "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+ "vpmovmskb %%ymm0, %0" : "=r" (rc) );
+#else
+ asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+ "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+ "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+ "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+ "vpmovmskb %%xmm0, %0\n\t"
+ "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+ rc |= i << 16;
+#endif
+ if ( ~rc )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing stmxcsr (%edx)...");
if ( cpu_has_sse )
{
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -94,6 +94,12 @@ static inline uint64_t xgetbv(uint32_t x
(res.c & (1U << 0)) != 0; \
})

+#define cpu_has_sse4_1 ({ \
+ struct cpuid_leaf res; \
+ emul_test_cpuid(1, 0, &res, NULL); \
+ (res.c & (1U << 19)) != 0; \
+})
+
#define cpu_has_popcnt ({ \
struct cpuid_leaf res; \
emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1399,6 +1399,7 @@ static bool vcpu_has(
#define vcpu_has_sse2() vcpu_has( 1, EDX, 26, ctxt, ops)
#define vcpu_has_sse3() vcpu_has( 1, ECX, 0, ctxt, ops)
#define vcpu_has_cx16() vcpu_has( 1, ECX, 13, ctxt, ops)
+#define vcpu_has_sse4_1() vcpu_has( 1, ECX, 19, ctxt, ops)
#define vcpu_has_sse4_2() vcpu_has( 1, ECX, 20, ctxt, ops)
#define vcpu_has_movbe() vcpu_has( 1, ECX, 22, ctxt, ops)
#define vcpu_has_popcnt() vcpu_has( 1, ECX, 23, ctxt, ops)
@@ -5919,6 +5920,7 @@ x86_emulate(
case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa {x,y}mm,{x,y}mm/m128 */
case X86EMUL_OPC_F3(0x0f, 0x7f): /* movdqu xmm,xmm/m128 */
case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu {x,y}mm,{x,y}mm/mem */
+ movdqa:
d |= TwoOp;
op_bytes = 16 << vex.l;
if ( vex.opcx != vex_none )
@@ -6814,6 +6816,23 @@ x86_emulate(
sfence = true;
break;

+ case X86EMUL_OPC_66(0x0f38, 0x2a): /* movntdqa m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ /* Ignore the non-temporal hint for now, using movdqa instead. */
+ asm volatile ( "mfence" ::: "memory" );
+ b = 0x6f;
+ if ( vex.opcx == vex_none )
+ vcpu_must_have(sse4_1);
+ else
+ {
+ vex.opcx = vex_0f;
+ if ( vex.l )
+ vcpu_must_have(avx2);
+ }
+ state->simd_size = simd_packed_int;
+ goto movdqa;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);
Andrew Cooper
2017-03-01 14:58:56 UTC
Permalink
Post by Jan Beulich
... as the only post-SSE2 move insn.
Reviewed-by: Andrew Cooper <***@citrix.com>
Jan Beulich
2017-02-28 12:54:29 UTC
Permalink
This being a strict (MMX register only) subset of SSE, we can simply
adjust the respective checks while making the new predicate look at
both flags.

Signed-off-by: Jan Beulich <***@suse.com>
Reviewed-by: Andrew Cooper <***@citrix.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1405,6 +1405,8 @@ static bool vcpu_has(
#define vcpu_has_popcnt() vcpu_has( 1, ECX, 23, ctxt, ops)
#define vcpu_has_avx() vcpu_has( 1, ECX, 28, ctxt, ops)
#define vcpu_has_rdrand() vcpu_has( 1, ECX, 30, ctxt, ops)
+#define vcpu_has_mmxext() (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
+ vcpu_has_sse())
#define vcpu_has_lahf_lm() vcpu_has(0x80000001, ECX, 0, ctxt, ops)
#define vcpu_has_cr8_legacy() vcpu_has(0x80000001, ECX, 4, ctxt, ops)
#define vcpu_has_lzcnt() vcpu_has(0x80000001, ECX, 5, ctxt, ops)
@@ -5707,8 +5709,12 @@ x86_emulate(
else
{
if ( b != 0x50 )
+ {
host_and_vcpu_must_have(mmx);
- vcpu_must_have(sse);
+ vcpu_must_have(mmxext);
+ }
+ else
+ vcpu_must_have(sse);
}
if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
get_fpu(X86EMUL_FPU_xmm, &fic);
@@ -5966,7 +5972,7 @@ x86_emulate(
else
{
host_and_vcpu_must_have(mmx);
- vcpu_must_have(sse);
+ vcpu_must_have(mmxext);
get_fpu(X86EMUL_FPU_mmx, &fic);
}
simd_0f_imm8:
@@ -6252,7 +6258,7 @@ x86_emulate(
if ( modrm_mod == 3 ) /* sfence */
{
generate_exception_if(vex.pfx, EXC_UD);
- vcpu_must_have(sse);
+ vcpu_must_have(mmxext);
asm volatile ( "sfence" ::: "memory" );
break;
}
@@ -6736,7 +6742,7 @@ x86_emulate(
case X86EMUL_OPC(0x0f, 0xe3): /* pavgw mm/m64,mm */
case X86EMUL_OPC(0x0f, 0xe4): /* pmulhuw mm/m64,mm */
case X86EMUL_OPC(0x0f, 0xf6): /* psadbw mm/m64,mm */
- vcpu_must_have(sse);
+ vcpu_must_have(mmxext);
goto simd_0f_mmx;

case X86EMUL_OPC_66(0x0f, 0xe6): /* cvttpd2dq xmm/mem,xmm */
@@ -6767,7 +6773,7 @@ x86_emulate(
else
{
host_and_vcpu_must_have(mmx);
- vcpu_must_have(sse);
+ vcpu_must_have(mmxext);
get_fpu(X86EMUL_FPU_mmx, &fic);
}
Jan Beulich
2017-02-28 12:53:57 UTC
Permalink
... and their AVX equivalents. Note that a few instructions aren't
covered (yet), but those all fall into common pattern groups, so I
would hope that for now we can make do with what is there.

MMX insns aren't covered at all, as they're not easy to deal
with: the compiler refuses to emit them other than for uses of
built-in functions.
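A minimal sketch of what covering them would take (not part of the
harness; the function name is made up and it would need building with
-mmmx):

#include <mmintrin.h>

/* paddb via the MMX built-in; plain C vector arithmetic would instead
   get compiled to SSE (or scalar) code. */
__m64 mmx_add_bytes(__m64 x, __m64 y)
{
    return _mm_add_pi8(x, y);
}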

The current way of testing AVX insns is meant to be temporary only:
once we fully support that feature, the present tests should be
replaced rather than merely supplemented with full ones.

Signed-off-by: Jan Beulich <***@suse.com>
Acked-by: Andrew Cooper <***@citrix.com>
---
v4: Put spaces around ##. Parenthesize uses of macro parameters. Fix
indentation for a few preprocessor directives.
v2: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,11 +11,36 @@ all: $(TARGET)
run: $(TARGET)
./$(TARGET)

-TESTCASES := blowfish
+TESTCASES := blowfish simd

blowfish-cflags := ""
blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="

+sse-vecs := 16
+sse-ints :=
+sse-flts := 4
+sse2-vecs := $(sse-vecs)
+sse2-ints := 1 2 4 8
+sse2-flts := 4 8
+
+# When converting SSE to AVX, have the compiler avoid XMM0 to widen
+# coverage of the VEX.vvvv checks in the emulator.
+sse2avx := -ffixed-xmm0 -Wa,-msse2avx
+
+simd-cflags := $(foreach flavor,sse sse2, \
+ $(foreach vec,$($(flavor)-vecs), \
+ $(foreach int,$($(flavor)-ints), \
+ "-D$(flavor)_$(vec)i$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+ "-D$(flavor)_$(vec)u$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)" \
+ "-D$(flavor)_avx_$(vec)i$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+ "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+ $(foreach flt,$($(flavor)-flts), \
+ "-D$(flavor)_$(vec)f$(flt) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)" \
+ "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+ $(foreach flt,$($(flavor)-flts), \
+ "-D$(flavor)_f$(flt) -m$(flavor) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)" \
+ "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=sse $(sse2avx) -O2 -DFLOAT_SIZE=$(flt)"))
+
$(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
rm -f $@.new $*.bin
$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
--- /dev/null
+++ b/tools/tests/x86_emulator/simd.c
@@ -0,0 +1,450 @@
+#include <stdbool.h>
+
+asm (
+ "\t.text\n"
+ "\t.globl _start\n"
+ "_start:\n"
+#if defined(__i386__) && VEC_SIZE == 16
+ "\tpush %ebp\n"
+ "\tmov %esp,%ebp\n"
+ "\tand $~0xf,%esp\n"
+ "\tcall simd_test\n"
+ "\tleave\n"
+ "\tret"
+#else
+ "\tjmp simd_test"
+#endif
+ );
+
+typedef
+#if defined(INT_SIZE)
+# define ELEM_SIZE INT_SIZE
+signed int
+# if INT_SIZE == 1
+# define MODE QI
+# elif INT_SIZE == 2
+# define MODE HI
+# elif INT_SIZE == 4
+# define MODE SI
+# elif INT_SIZE == 8
+# define MODE DI
+# endif
+#elif defined(UINT_SIZE)
+# define ELEM_SIZE UINT_SIZE
+unsigned int
+# if UINT_SIZE == 1
+# define MODE QI
+# elif UINT_SIZE == 2
+# define MODE HI
+# elif UINT_SIZE == 4
+# define MODE SI
+# elif UINT_SIZE == 8
+# define MODE DI
+# endif
+#elif defined(FLOAT_SIZE)
+float
+# define ELEM_SIZE FLOAT_SIZE
+# if FLOAT_SIZE == 4
+# define MODE SF
+# elif FLOAT_SIZE == 8
+# define MODE DF
+# endif
+#endif
+#ifndef VEC_SIZE
+# define VEC_SIZE ELEM_SIZE
+#endif
+__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
+
+#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
+
+typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;
+
+/* Various builtins want plain char / int / long long vector types ... */
+typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
+typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
+typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
+#if VEC_SIZE >= 8
+typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
+#endif
+
+#if VEC_SIZE == 8 && defined(__SSE__)
+# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
+#elif VEC_SIZE == 16
+# if defined(__SSE__) && ELEM_SIZE == 4
+# define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
+# elif defined(__SSE2__)
+# if ELEM_SIZE == 8
+# define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) == 3)
+# else
+# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# endif
+# endif
+#endif
+
+#ifndef to_bool
+static inline bool _to_bool(byte_vec_t bv)
+{
+ unsigned int i;
+
+ for ( i = 0; i < VEC_SIZE; ++i )
+ if ( bv[i] != 0xff )
+ return false;
+
+ return true;
+}
+# define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
+#endif
+
+#if VEC_SIZE == FLOAT_SIZE
+# define to_int(x) ((vec_t){ (int)(x)[0] })
+#elif VEC_SIZE == 16 && defined(__SSE2__)
+# if FLOAT_SIZE == 4
+# define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
+# elif FLOAT_SIZE == 8
+# define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
+# endif
+#endif
+
+#if VEC_SIZE == FLOAT_SIZE
+# define scalar_1op(x, op) ({ \
+ typeof((x)[0]) __attribute__((vector_size(16))) r_; \
+ asm ( op : [out] "=&x" (r_) : [in] "m" (x) ); \
+ (vec_t){ r_[0] }; \
+})
+#endif
+
+#if FLOAT_SIZE == 4 && defined(__SSE__)
+# if VEC_SIZE == 16
+# define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
+# define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
+# define max(x, y) __builtin_ia32_maxps(x, y)
+# define min(x, y) __builtin_ia32_minps(x, y)
+# define recip(x) __builtin_ia32_rcpps(x)
+# define rsqrt(x) __builtin_ia32_rsqrtps(x)
+# define sqrt(x) __builtin_ia32_sqrtps(x)
+# define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
+# elif VEC_SIZE == 4
+# define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
+# define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
+# define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
+# endif
+#elif FLOAT_SIZE == 8 && defined(__SSE2__)
+# if VEC_SIZE == 16
+# define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
+# define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
+# define max(x, y) __builtin_ia32_maxpd(x, y)
+# define min(x, y) __builtin_ia32_minpd(x, y)
+# define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2ps(x)))
+# define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
+# define sqrt(x) __builtin_ia32_sqrtpd(x)
+# define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
+# elif VEC_SIZE == 8
+# define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
+# define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
+# define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
+# endif
+#endif
+#if VEC_SIZE == 16 && defined(__SSE2__)
+# if INT_SIZE == 1 || UINT_SIZE == 1
+# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
+# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)(x), (vhi_t)(y)))
+# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)(x), (vhi_t)(y)))
+# define swap(x) ((vec_t)__builtin_ia32_pshufd( \
+ (vsi_t)__builtin_ia32_pshufhw( \
+ __builtin_ia32_pshuflw((vhi_t)(x), 0b00011011), 0b00011011), 0b01001110))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)(x), (vsi_t)(y)))
+# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)(x), (vsi_t)(y)))
+# define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b00011011))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)(x), (vdi_t)(y)))
+# define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)(x), (vdi_t)(y)))
+# define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b01001110))
+# endif
+# if UINT_SIZE == 1
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)(x), (vqi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)(x), (vqi_t)(y)))
+# elif INT_SIZE == 2
+# define max(x, y) __builtin_ia32_pmaxsw128(x, y)
+# define min(x, y) __builtin_ia32_pminsw128(x, y)
+# define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)
+# elif UINT_SIZE == 2
+# define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)(x), (vhi_t)(y)))
+# elif UINT_SIZE == 4
+# define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)(x), (vsi_t)(y)))
+# endif
+# define select(d, x, y, m) ({ \
+ void *d_ = (d); \
+ vqi_t m_ = (vqi_t)(m); \
+ __builtin_ia32_maskmovdqu((vqi_t)(x), m_, d_); \
+ __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
+})
+#endif
+#if VEC_SIZE == FLOAT_SIZE
+# define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
+# define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
+#endif
+
+/*
+ * Suppress value propagation by the compiler, preventing unwanted
+ * optimization. This at once makes the compiler use memory operands
+ * more often, which for our purposes is the more interesting case.
+ */
+#define touch(var) asm volatile ( "" : "+m" (var) )
+
+int simd_test(void)
+{
+ unsigned int i, j;
+ vec_t x, y, z, src, inv, alt, sh;
+
+ for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
+ {
+ src[i] = i + 1;
+ inv[i] = ELEM_COUNT - i;
+#ifdef UINT_SIZE
+ alt[i] = -!(i & 1);
+#else
+ alt[i] = i & 1 ? -1 : 1;
+#endif
+ if ( !(i & (i + 1)) )
+ --j;
+ sh[i] = j;
+ }
+
+ touch(src);
+ x = src;
+ touch(x);
+ if ( !to_bool(x == src) ) return __LINE__;
+
+ touch(src);
+ y = x + src;
+ touch(src);
+ touch(y);
+ if ( !to_bool(y == 2 * src) ) return __LINE__;
+
+ touch(src);
+ z = y -= src;
+ touch(z);
+ if ( !to_bool(x == z) ) return __LINE__;
+
+#if defined(UINT_SIZE)
+
+ touch(inv);
+ x |= inv;
+ touch(inv);
+ y &= inv;
+ touch(inv);
+ z ^= inv;
+ touch(inv);
+ touch(x);
+ if ( !to_bool((x & ~y) == z) ) return __LINE__;
+
+#elif ELEM_SIZE > 1 || VEC_SIZE <= 8
+
+ touch(src);
+ x *= src;
+ y = inv * inv;
+ touch(src);
+ z = src + inv;
+ touch(inv);
+ z *= (src - inv);
+ if ( !to_bool(x - y == z) ) return __LINE__;
+
+#endif
+
+#if defined(FLOAT_SIZE)
+
+ x = src * alt;
+ touch(alt);
+ y = src / alt;
+ if ( !to_bool(x == y) ) return __LINE__;
+ touch(alt);
+ touch(src);
+ if ( !to_bool(x * -alt == -src) ) return __LINE__;
+
+# if defined(recip) && defined(to_int)
+
+ touch(src);
+ x = recip(src);
+ touch(src);
+ touch(x);
+ if ( !to_bool(to_int(recip(x)) == src) ) return __LINE__;
+
+# ifdef rsqrt
+ x = src * src;
+ touch(x);
+ y = rsqrt(x);
+ touch(y);
+ if ( !to_bool(to_int(recip(y)) == src) ) return __LINE__;
+ touch(src);
+ if ( !to_bool(to_int(y) == to_int(recip(src))) ) return __LINE__;
+# endif
+
+# endif
+
+# ifdef sqrt
+ x = src * src;
+ touch(x);
+ if ( !to_bool(sqrt(x) == src) ) return __LINE__;
+# endif
+
+#else
+
+# if ELEM_SIZE > 1
+
+ touch(inv);
+ x = src * inv;
+ touch(inv);
+ y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
+ for ( i = 1; i < ELEM_COUNT / 2; ++i )
+ y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
+ if ( !to_bool(x == y) ) return __LINE__;
+
+# ifdef mul_hi
+ touch(alt);
+ x = mul_hi(src, alt);
+ touch(alt);
+# ifdef INT_SIZE
+ if ( !to_bool(x == (alt < 0)) ) return __LINE__;
+# else
+ if ( !to_bool(x == (src & alt) + alt) ) return __LINE__;
+# endif
+# endif
+
+# ifdef mul_full
+ x = src ^ alt;
+ touch(inv);
+ y = mul_full(x, inv);
+ touch(inv);
+ for ( i = 0; i < ELEM_COUNT; i += 2 )
+ {
+ unsigned long long res = x[i] * 1ULL * inv[i];
+
+ z[i] = res;
+ z[i + 1] = res >> (ELEM_SIZE << 3);
+ }
+ if ( !to_bool(y == z) ) return __LINE__;
+# endif
+
+ z = src;
+# ifdef INT_SIZE
+ z *= alt;
+# endif
+ touch(z);
+ x = z << 3;
+ touch(z);
+ y = z << 2;
+ touch(z);
+ if ( !to_bool(x == y + y) ) return __LINE__;
+
+ touch(x);
+ z = x >> 2;
+ touch(x);
+ if ( !to_bool(y == z + z) ) return __LINE__;
+
+ z = src;
+# ifdef INT_SIZE
+ z *= alt;
+# endif
+ /*
+ * Note that despite the touch()-es here there doesn't appear to be a way
+ * to make the compiler use a memory operand for the shift instruction (at
+ * least without resorting to built-ins).
+ */
+ j = 3;
+ touch(j);
+ x = z << j;
+ touch(j);
+ j = 2;
+ touch(j);
+ y = z << j;
+ touch(j);
+ if ( !to_bool(x == y + y) ) return __LINE__;
+
+ z = x >> j;
+ touch(j);
+ if ( !to_bool(y == z + z) ) return __LINE__;
+
+# endif
+
+# if ELEM_SIZE == 2 || defined(__SSE4_1__)
+ /*
+ * Even when there are no instructions with varying shift counts per
+ * field, the code turns out to be a nice exercise for pextr/pinsr.
+ */
+ z = src;
+# ifdef INT_SIZE
+ z *= alt;
+# endif
+ /*
+ * Zap elements for which the shift count is negative (and hence the
+ * decrement below would yield a negative count).
+ */
+ z &= (sh > 0);
+ touch(sh);
+ x = z << sh;
+ touch(sh);
+ --sh;
+ touch(sh);
+ y = z << sh;
+ touch(sh);
+ if ( !to_bool(x == y + y) ) return __LINE__;
+
+# endif
+
+#endif
+
+#if defined(max) && defined(min)
+# ifdef UINT_SIZE
+ touch(inv);
+ x = min(src, inv);
+ touch(inv);
+ y = max(src, inv);
+ touch(inv);
+ if ( !to_bool(x + y == src + inv) ) return __LINE__;
+# else
+ x = src * alt;
+ y = inv * alt;
+ touch(y);
+ z = max(x, y);
+ touch(y);
+ y = min(x, y);
+ touch(y);
+ if ( !to_bool((y + z) * alt == src + inv) ) return __LINE__;
+# endif
+#endif
+
+#ifdef swap
+ touch(src);
+ if ( !to_bool(swap(src) == inv) ) return __LINE__;
+#endif
+
+#if defined(interleave_lo) && defined(interleave_hi)
+ touch(src);
+ x = interleave_lo(inv, src);
+ touch(src);
+ y = interleave_hi(inv, src);
+ touch(src);
+# ifdef UINT_SIZE
+ z = ((x - y) ^ ~alt) - ~alt;
+# else
+ z = (x - y) * alt;
+# endif
+ if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
+#endif
+
+#ifdef select
+# ifdef UINT_SIZE
+ select(&z, src, inv, alt);
+# else
+ select(&z, src, inv, alt > 0);
+# endif
+ for ( i = 0; i < ELEM_COUNT; ++i )
+ y[i] = (i & 1 ? inv : src)[i];
+ if ( !to_bool(z == y) ) return __LINE__;
+#endif
+
+ return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -5,6 +5,7 @@

#include "x86_emulate.h"
#include "blowfish.h"
+#include "simd.h"

#define verbose false /* Switch to true for far more logging. */

@@ -19,11 +20,43 @@ static bool blowfish_check_regs(const st
return regs->eax == 2 && regs->edx == 1;
}

+static bool simd_check_sse(void)
+{
+ return cpu_has_sse;
+}
+
+static bool simd_check_sse2(void)
+{
+ return cpu_has_sse2;
+}
+
+static bool simd_check_avx(void)
+{
+ return cpu_has_avx;
+}
+#define simd_check_sse_avx simd_check_avx
+#define simd_check_sse2_avx simd_check_avx
+
+static void simd_set_regs(struct cpu_user_regs *regs)
+{
+ if ( cpu_has_mmx )
+ asm volatile ( "emms" );
+}
+
+static bool simd_check_regs(const struct cpu_user_regs *regs)
+{
+ if ( !regs->eax )
+ return true;
+ printf("[line %u] ", (unsigned int)regs->eax);
+ return false;
+}
+
static const struct {
const void *code;
size_t size;
unsigned int bitness;
const char*name;
+ bool (*check_cpu)(void);
void (*set_regs)(struct cpu_user_regs *);
bool (*check_regs)(const struct cpu_user_regs *);
} blobs[] = {
@@ -39,6 +72,49 @@ static const struct {
BLOWFISH(32, blowfish, ),
BLOWFISH(32, blowfish (push), _mno_accumulate_outgoing_args),
#undef BLOWFISH
+#define SIMD_(bits, desc, feat, form) \
+ { .code = simd_x86_ ## bits ## _D ## feat ## _ ## form, \
+ .size = sizeof(simd_x86_ ## bits ## _D ## feat ## _ ## form), \
+ .bitness = bits, .name = #desc, \
+ .check_cpu = simd_check_ ## feat, \
+ .set_regs = simd_set_regs, \
+ .check_regs = simd_check_regs }
+#ifdef __x86_64__
+# define SIMD(desc, feat, form) SIMD_(64, desc, feat, form), \
+ SIMD_(32, desc, feat, form)
+#else
+# define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
+#endif
+ SIMD(SSE scalar single, sse, f4),
+ SIMD(SSE packed single, sse, 16f4),
+ SIMD(SSE2 scalar single, sse2, f4),
+ SIMD(SSE2 packed single, sse2, 16f4),
+ SIMD(SSE2 scalar double, sse2, f8),
+ SIMD(SSE2 packed double, sse2, 16f8),
+ SIMD(SSE2 packed s8, sse2, 16i1),
+ SIMD(SSE2 packed u8, sse2, 16u1),
+ SIMD(SSE2 packed s16, sse2, 16i2),
+ SIMD(SSE2 packed u16, sse2, 16u2),
+ SIMD(SSE2 packed s32, sse2, 16i4),
+ SIMD(SSE2 packed u32, sse2, 16u4),
+ SIMD(SSE2 packed s64, sse2, 16i8),
+ SIMD(SSE2 packed u64, sse2, 16u8),
+ SIMD(SSE/AVX scalar single, sse_avx, f4),
+ SIMD(SSE/AVX packed single, sse_avx, 16f4),
+ SIMD(SSE2/AVX scalar single, sse2_avx, f4),
+ SIMD(SSE2/AVX packed single, sse2_avx, 16f4),
+ SIMD(SSE2/AVX scalar double, sse2_avx, f8),
+ SIMD(SSE2/AVX packed double, sse2_avx, 16f8),
+ SIMD(SSE2/AVX packed s8, sse2_avx, 16i1),
+ SIMD(SSE2/AVX packed u8, sse2_avx, 16u1),
+ SIMD(SSE2/AVX packed s16, sse2_avx, 16i2),
+ SIMD(SSE2/AVX packed u16, sse2_avx, 16u2),
+ SIMD(SSE2/AVX packed s32, sse2_avx, 16i4),
+ SIMD(SSE2/AVX packed u32, sse2_avx, 16u4),
+ SIMD(SSE2/AVX packed s64, sse2_avx, 16i8),
+ SIMD(SSE2/AVX packed u64, sse2_avx, 16u8),
+#undef SIMD_
+#undef SIMD
};

static unsigned int bytes_read;
@@ -2589,6 +2665,9 @@ int main(int argc, char **argv)
continue;
}

+ if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
+ continue;
+
memcpy(res, blobs[j].code, blobs[j].size);
ctxt.addr_size = ctxt.sp_size = blobs[j].bitness;
Jan Beulich
2017-02-28 12:54:59 UTC
Permalink
Convert the few existing opcodes so far supported.

Signed-off-by: Jan Beulich <***@suse.com>
---
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -43,6 +43,8 @@
#define SrcMask (7<<3)
/* Generic ModRM decode. */
#define ModRM (1<<6)
+/* vSIB addressing mode (0f38 extension opcodes only), aliasing ModRM. */
+#define vSIB (1<<6)
/* Destination is only written; never read. */
#define Mov (1<<7)
/* VEX/EVEX (SIMD only): 2nd source operand unused (must be all ones) */
@@ -340,6 +342,28 @@ static const struct {
[0xff] = { ModRM }
};

+static const struct {
+ uint8_t simd_size:5;
+ uint8_t to_memory:1;
+ uint8_t two_op:1;
+ uint8_t vsib:1;
+} ext0f38_table[256] = {
+ [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xf0] = { .two_op = 1 },
+ [0xf1] = { .to_memory = 1, .two_op = 1 },
+ [0xf2 ... 0xf3] = {},
+ [0xf5 ... 0xf7] = {},
+};
+
+static const struct {
+ uint8_t simd_size:5;
+ uint8_t to_memory:1;
+ uint8_t two_op:1;
+ uint8_t four_op:1;
+} ext0f3a_table[256] = {
+ [0xf0] = {},
+};
+
static const opcode_desc_t xop_table[] = {
DstReg|SrcImmByte|ModRM,
DstReg|SrcMem|ModRM,
@@ -2129,7 +2153,7 @@ x86_decode_onebyte(
/* fall through */
case 3: /* call (far, absolute indirect) */
case 5: /* jmp (far, absolute indirect) */
- state->desc = DstNone | SrcMem | ModRM | Mov;
+ state->desc = DstNone | SrcMem | Mov;
break;
}
break;
@@ -2199,7 +2223,7 @@ x86_decode_twobyte(
if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
{
case X86EMUL_OPC_VEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
- state->desc = DstImplicit | SrcMem | ModRM | Mov;
+ state->desc = DstImplicit | SrcMem | Mov;
state->simd_size = simd_other;
/* Avoid the state->desc adjustment below. */
return X86EMUL_OKAY;
@@ -2213,12 +2237,12 @@ x86_decode_twobyte(
switch ( modrm_reg & 7 )
{
case 2: /* {,v}ldmxcsr */
- state->desc = DstImplicit | SrcMem | ModRM | Mov;
+ state->desc = DstImplicit | SrcMem | Mov;
op_bytes = 4;
break;

case 3: /* {,v}stmxcsr */
- state->desc = DstMem | SrcImplicit | ModRM | Mov;
+ state->desc = DstMem | SrcImplicit | Mov;
op_bytes = 4;
break;
}
@@ -2239,7 +2263,7 @@ x86_decode_twobyte(
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
/* fall through */
case X86EMUL_OPC_VEX_66(0, 0xc4): /* vpinsrw */
- state->desc = DstReg | SrcMem16 | ModRM;
+ state->desc = DstReg | SrcMem16;
break;
}

@@ -2275,8 +2299,8 @@ x86_decode_0f38(
break;

case 0xf1: /* movbe / crc32 */
- if ( !repne_prefix() )
- state->desc = (state->desc & ~(DstMask | SrcMask)) | DstMem | SrcReg | Mov;
+ if ( repne_prefix() )
+ state->desc = DstReg | SrcMem;
if ( rep_prefix() )
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
break;
@@ -2527,10 +2551,7 @@ x86_decode(
opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);

if ( !(d & ModRM) )
- {
- modrm_reg = modrm_rm = modrm_mod = modrm = 0;
break;
- }

modrm = insn_fetch_type(uint8_t);
modrm_mod = (modrm & 0xc0) >> 6;
@@ -2541,6 +2562,8 @@ x86_decode(

if ( d & ModRM )
{
+ d &= ~ModRM;
+#undef ModRM /* Only its aliases are valid to use from here on. */
modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
modrm_rm = modrm & 0x07;

@@ -2550,8 +2573,9 @@ x86_decode(
* normally be only addition/removal of SrcImm/SrcImm16, so their
* fetching can be taken care of by the common code below.
*/
- if ( ext == ext_none )
+ switch ( ext )
{
+ case ext_none:
switch ( b )
{
case 0xf6 ... 0xf7: /* Grp3 */
@@ -2577,6 +2601,25 @@ x86_decode(
}
break;
}
+ break;
+
+ case vex_0f38:
+ d = ext0f38_table[b].to_memory ? DstMem | SrcReg
+ : DstReg | SrcMem;
+ if ( ext0f38_table[b].two_op )
+ d |= TwoOp;
+ if ( ext0f38_table[b].vsib )
+ d |= vSIB;
+ state->simd_size = ext0f38_table[b].simd_size;
+ break;
+
+ case vex_0f3a:
+ /*
+ * Cannot update d here yet, as the immediate operand still
+ * needs fetching.
+ */
+ default:
+ break;
}

if ( modrm_mod == 3 )
@@ -2587,6 +2630,7 @@ x86_decode(
else if ( ad_bytes == 2 )
{
/* 16-bit ModR/M decode. */
+ generate_exception_if(d & vSIB, EXC_UD);
ea.type = OP_MEM;
switch ( modrm_rm )
{
@@ -2643,7 +2687,7 @@ x86_decode(
sib = insn_fetch_type(uint8_t);
sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8);
sib_base = (sib & 7) | ((rex_prefix << 3) & 8);
- if ( sib_index != 4 )
+ if ( sib_index != 4 && !(d & vSIB) )
ea.mem.off = *(long *)decode_register(sib_index,
state->regs, 0);
ea.mem.off <<= (sib >> 6) & 3;
@@ -2669,6 +2713,7 @@ x86_decode(
}
else
{
+ generate_exception_if(d & vSIB, EXC_UD);
modrm_rm |= (rex_prefix & 1) << 3;
ea.mem.off = *(long *)decode_register(modrm_rm,
state->regs, 0);
@@ -2692,6 +2737,11 @@ x86_decode(
}
}
}
+ else
+ {
+ modrm_mod = 0xff;
+ modrm_reg = modrm_rm = modrm = 0;
+ }

if ( override_seg != x86_seg_none )
ea.mem.seg = override_seg;
@@ -2740,6 +2790,13 @@ x86_decode(
break;

case ext_0f3a:
+ d = ext0f3a_table[b].to_memory ? DstMem | SrcReg : DstReg | SrcMem;
+ if ( ext0f3a_table[b].two_op )
+ d |= TwoOp;
+ else if ( ext0f3a_table[b].four_op && !mode_64bit() && vex.opcx )
+ imm1 &= 0x7f;
+ state->desc = d;
+ state->simd_size = ext0f3a_table[b].simd_size;
if ( !vex.opcx )
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
break;
@@ -6836,7 +6893,6 @@ x86_emulate(
if ( vex.l )
vcpu_must_have(avx2);
}
- state->simd_size = simd_packed_int;
goto movdqa;

case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
@@ -7390,7 +7446,7 @@ x86_insn_modrm(const struct x86_emulate_
{
check_state(state);

- if ( !(state->desc & ModRM) )
+ if ( state->modrm_mod > 3 )
return -EINVAL;

if ( rm )
Andrew Cooper
2017-03-01 15:49:59 UTC
Permalink
Post by Jan Beulich
Convert the few existing opcodes so far supported.
---
v3: New.
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -43,6 +43,8 @@
#define SrcMask (7<<3)
/* Generic ModRM decode. */
#define ModRM (1<<6)
+/* vSIB addressing mode (0f38 extension opcodes only), aliasing ModRM. */
+#define vSIB (1<<6)
/* Destination is only written; never read. */
#define Mov (1<<7)
/* VEX/EVEX (SIMD only): 2nd source operand unused (must be all ones) */
@@ -340,6 +342,28 @@ static const struct {
[0xff] = { ModRM }
};
+static const struct {
+ uint8_t simd_size:5;
+ uint8_t to_memory:1;
Depending on how often it is used, what about shortening to "to_mem"?
It is no less clear.
Post by Jan Beulich
+ uint8_t two_op:1;
+ uint8_t vsib:1;
+} ext0f38_table[256] = {
+ [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xf0] = { .two_op = 1 },
+ [0xf1] = { .to_memory = 1, .two_op = 1 },
+ [0xf2 ... 0xf3] = {},
+ [0xf5 ... 0xf7] = {},
+};
+
+static const struct {
+ uint8_t simd_size:5;
+ uint8_t to_memory:1;
+ uint8_t two_op:1;
+ uint8_t four_op:1;
+} ext0f3a_table[256] = {
+ [0xf0] = {},
+};
+
static const opcode_desc_t xop_table[] = {
DstReg|SrcImmByte|ModRM,
DstReg|SrcMem|ModRM,
@@ -2692,6 +2737,11 @@ x86_decode(
}
}
}
+ else
+ {
+ modrm_mod = 0xff;
+ modrm_reg = modrm_rm = modrm = 0;
+ }
if ( override_seg != x86_seg_none )
ea.mem.seg = override_seg;
@@ -2740,6 +2790,13 @@ x86_decode(
break;
+ d = ext0f3a_table[b].to_memory ? DstMem | SrcReg : DstReg | SrcMem;
+ if ( ext0f3a_table[b].two_op )
+ d |= TwoOp;
+ else if ( ext0f3a_table[b].four_op && !mode_64bit() && vex.opcx )
+ imm1 &= 0x7f;
Is this sensible to do? The behaviour of imm1 doesn't appear to be very
consistent across encodings. As it is all passed onto hardware anyway
via stub, does it really matter?

~Andrew
Post by Jan Beulich
+ state->desc = d;
+ state->simd_size = ext0f3a_table[b].simd_size;
if ( !vex.opcx )
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
break;
Jan Beulich
2017-03-01 16:11:32 UTC
Permalink
Post by Andrew Cooper
Post by Jan Beulich
@@ -340,6 +342,28 @@ static const struct {
[0xff] = { ModRM }
};
+static const struct {
+ uint8_t simd_size:5;
+ uint8_t to_memory:1;
Depending on how often it is used, what about shortening to "to_mem"?
It is no less clear.
No problem.
Post by Andrew Cooper
Post by Jan Beulich
@@ -2740,6 +2790,13 @@ x86_decode(
break;
+ d = ext0f3a_table[b].to_memory ? DstMem | SrcReg : DstReg | SrcMem;
+ if ( ext0f3a_table[b].two_op )
+ d |= TwoOp;
+ else if ( ext0f3a_table[b].four_op && !mode_64bit() && vex.opcx )
+ imm1 &= 0x7f;
Is this sensible to do? The behaviour of imm1 doesn't appear to be very
consistent across encodings. As it is all passed onto hardware anyway
via stub, does it really matter?
Oh, yes, it does matter: We're running in 64-bit mode, and the high
bit, when representing a register, is ignored outside of 64-bit mode.
If we didn't mask it off, we'd access the wrong register if 32- or 16-
bit code had the bit set.
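As a worked example (standalone sketch, not emulator code; the is4
layout is as the SDM describes for e.g. vblendvps): the extra register
lives in imm8[7:4], and outside 64-bit mode bit 3 of that register
number has to read as zero, which is what the masking above achieves.

#include <stdbool.h>
#include <stdint.h>

unsigned int is4_register(uint8_t imm8, bool in_64bit_mode)
{
    unsigned int reg = imm8 >> 4;      /* imm8[7:4] selects the register */

    if ( !in_64bit_mode )
        reg &= 7;                      /* only xmm0-xmm7 are addressable */

    return reg;
}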

Jan
Andrew Cooper
2017-03-01 20:35:08 UTC
Permalink
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
@@ -340,6 +342,28 @@ static const struct {
[0xff] = { ModRM }
};
+static const struct {
+ uint8_t simd_size:5;
+ uint8_t to_memory:1;
Depending on how often it is used, what about shortening to "to_mem"?
It is no less clear.
No problem.
Post by Andrew Cooper
Post by Jan Beulich
@@ -2740,6 +2790,13 @@ x86_decode(
break;
+ d = ext0f3a_table[b].to_memory ? DstMem | SrcReg : DstReg | SrcMem;
+ if ( ext0f3a_table[b].two_op )
+ d |= TwoOp;
+ else if ( ext0f3a_table[b].four_op && !mode_64bit() && vex.opcx )
+ imm1 &= 0x7f;
Is this sensible to do? The behaviour of imm1 doesn't appear to be very
consistent across encodings. As it is all passed onto hardware anyway
via stub, does it really matter?
Oh, yes, it does matter: We're running in 64-bit mode, and the high
bit, when representing a register, is ignored outside of 64-bit mode.
If we didn't mask it off, we'd access the wrong register if 32- or 16-
bit code had the bit set.
Ok, but my first question still stands.

Across this entire series, the only .four_op instructions appear to be
the VEX blend instructions at 660f3a4{a,b,c}, and these are called out
as special cases in the instruction manual.

How does the above condition logically equate to "this instruction uses
its imm8 byte to encode an extra source register"? That, after all, is
the purpose of the applied mask.

~Andrew
Jan Beulich
2017-03-02 08:15:18 UTC
Permalink
Post by Andrew Cooper
Post by Jan Beulich
Post by Andrew Cooper
Post by Jan Beulich
@@ -2740,6 +2790,13 @@ x86_decode(
break;
+ d = ext0f3a_table[b].to_memory ? DstMem | SrcReg : DstReg | SrcMem;
+ if ( ext0f3a_table[b].two_op )
+ d |= TwoOp;
+ else if ( ext0f3a_table[b].four_op && !mode_64bit() && vex.opcx )
+ imm1 &= 0x7f;
Is this sensible to do? The behaviour of imm1 doesn't appear to be very
consistent across encodings. As it is all passed onto hardware anyway
via stub, does it really matter?
Oh, yes, it does matter: We're running in 64-bit mode, and the high
bit, when representing a register, is ignored outside of 64-bit mode.
If we didn't mask it off, we'd access the wrong register if 32- or 16-
bit code had the bit set.
Ok, but my first question still stands.
Across this entire series, the only .four_op instructions appear to be
the VEX blend instructions at 660f3a4{a,b,c}, and these are called out
as special cases in the instruction manual.
How does the above condition logically equate to "this instruction uses
its imm8 byte to encode an extra source register"? That, after all, is
the purpose of the applied mask.
Well, "four_op" means four register operands (one of them
possibly also allowing for memory), just like two_op also doesn't
include possible immediates. "four_regs" would seem worse to
me, as that would be more along the lines of excluding memory
ones. I'll add a comment to this effect.

Jan
Jan Beulich
2017-02-28 12:55:42 UTC
Permalink
... and their AVX equivalents.

Signed-off-by: Jan Beulich <***@suse.com>
---
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -348,6 +348,8 @@ static const struct {
uint8_t two_op:1;
uint8_t vsib:1;
} ext0f38_table[256] = {
+ [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+ [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xf0] = { .two_op = 1 },
[0xf1] = { .to_memory = 1, .two_op = 1 },
@@ -361,6 +363,7 @@ static const struct {
uint8_t two_op:1;
uint8_t four_op:1;
} ext0f3a_table[256] = {
+ [0x0f] = { .simd_size = simd_packed_int },
[0xf0] = {},
};

@@ -1422,6 +1425,7 @@ static bool vcpu_has(
#define vcpu_has_sse() vcpu_has( 1, EDX, 25, ctxt, ops)
#define vcpu_has_sse2() vcpu_has( 1, EDX, 26, ctxt, ops)
#define vcpu_has_sse3() vcpu_has( 1, ECX, 0, ctxt, ops)
+#define vcpu_has_ssse3() vcpu_has( 1, ECX, 9, ctxt, ops)
#define vcpu_has_cx16() vcpu_has( 1, ECX, 13, ctxt, ops)
#define vcpu_has_sse4_1() vcpu_has( 1, ECX, 19, ctxt, ops)
#define vcpu_has_sse4_2() vcpu_has( 1, ECX, 20, ctxt, ops)
@@ -5916,6 +5920,21 @@ x86_emulate(
simd_0f_int:
if ( vex.opcx != vex_none )
{
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x00): /* vpshufb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x01): /* vphaddw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x02): /* vphaddd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x03): /* vphaddsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x04): /* vpmaddubsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x05): /* vphsubw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x06): /* vphsubd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x07): /* vphsubsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x08): /* vpsignb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x09): /* vpsignw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x0a): /* vpsignd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x0b): /* vpmulhrsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x1c): /* vpabsb {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x1d): /* vpabsw {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x1e): /* vpabsd {x,y}mm/mem,{x,y}mm */
if ( !vex.l )
goto simd_0f_avx;
host_and_vcpu_must_have(avx2);
@@ -6011,6 +6030,7 @@ x86_emulate(
simd_0f_int_imm8:
if ( vex.opcx != vex_none )
{
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( vex.l )
host_and_vcpu_must_have(avx2);
else
@@ -6879,6 +6899,58 @@ x86_emulate(
sfence = true;
break;

+ case X86EMUL_OPC(0x0f38, 0x00): /* pshufb mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x00): /* pshufb xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x01): /* phaddw mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x01): /* phaddw xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x02): /* phaddd mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x02): /* phaddd xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x03): /* phaddsw mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x03): /* phaddsw xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x04): /* pmaddubsw mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x04): /* pmaddubsw xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x05): /* phsubw mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x05): /* phsubw xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x06): /* phsubd mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x06): /* phsubd xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x07): /* phsubsw mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x07): /* phsubsw xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x08): /* psignb mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x08): /* psignb xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x09): /* psignw mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x09): /* psignw xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x0a): /* psignd mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x0a): /* psignd xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x0b): /* pmulhrsw mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x0b): /* pmulhrsw xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x1c): /* pabsb mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x1c): /* pabsb xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x1d): /* pabsw mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x1d): /* pabsw xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0x1e): /* pabsd mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f38, 0x1e): /* pabsd xmm/m128,xmm */
+ host_and_vcpu_must_have(ssse3);
+ if ( vex.pfx )
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ opc = init_prefixes(stub);
+ opc[0] = 0x38;
+ opc[1] = b;
+ opc[2] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[2] &= 0x38;
+ }
+ fic.insn_bytes = PFX_BYTES + 3;
+ break;
+
case X86EMUL_OPC_66(0x0f38, 0x2a): /* movntdqa m128,xmm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -7060,6 +7132,31 @@ x86_emulate(
: "0" ((uint32_t)src.val), "rm" (_regs._edx) );
break;

+ case X86EMUL_OPC(0x0f3a, 0x0f): /* palignr $imm8,mm/m64,mm */
+ case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
+ host_and_vcpu_must_have(ssse3);
+ if ( vex.pfx )
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ opc = init_prefixes(stub);
+ opc[0] = 0x3a;
+ opc[1] = b;
+ opc[2] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[2] &= 0x38;
+ }
+ opc[3] = imm1;
+ fic.insn_bytes = PFX_BYTES + 4;
+ break;
+
case X86EMUL_OPC_VEX_F2(0x0f3a, 0xf0): /* rorx imm,r/m,r */
vcpu_must_have(bmi2);
generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -39,6 +39,7 @@
#define cpu_has_mtrr 1
#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
#define cpu_has_sse3 boot_cpu_has(X86_FEATURE_SSE3)
+#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
#define cpu_has_sse4_2 boot_cpu_has(X86_FEATURE_SSE4_2)
#define cpu_has_popcnt boot_cpu_has(X86_FEATURE_POPCNT)
#define cpu_has_htt boot_cpu_has(X86_FEATURE_HTT)
Andrew Cooper
2017-03-01 16:06:40 UTC
Permalink
Post by Jan Beulich
... and their AVX equivalents.
Reviewed-by: Andrew Cooper <***@citrix.com>
Jan Beulich
2017-02-28 12:56:59 UTC
Permalink
... and their AVX equivalents.

Signed-off-by: Jan Beulich <***@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -2542,6 +2542,149 @@ int main(int argc, char **argv)
else
printf("skipped\n");

+ printf("%-40s", "Testing pcmpestri $0x1a,(%ecx),%xmm2...");
+ if ( stack_exec && cpu_has_sse4_2 )
+ {
+ decl_insn(pcmpestri);
+
+ memcpy(res, "abcdefgh\0\1\2\3\4\5\6\7", 16);
+ asm volatile ( "movq %0, %%xmm2\n"
+ put_insn(pcmpestri, "pcmpestri $0b00011010, (%1), %%xmm2")
+ :: "m" (res[0]), "c" (NULL) );
+
+ set_insn(pcmpestri);
+ regs.eax = regs.edx = 12;
+ regs.ecx = (unsigned long)res;
+ regs.eflags = X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_IF | X86_EFLAGS_OF;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(pcmpestri) ||
+ regs.ecx != 9 ||
+ (regs.eflags & X86_EFLAGS_ARITH_MASK) !=
+ (X86_EFLAGS_CF | X86_EFLAGS_ZF | X86_EFLAGS_SF) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing pcmpestrm $0x5a,(%ecx),%xmm2...");
+ if ( stack_exec && cpu_has_sse4_2 )
+ {
+ decl_insn(pcmpestrm);
+
+ asm volatile ( "movq %0, %%xmm2\n"
+ put_insn(pcmpestrm, "pcmpestrm $0b01011010, (%1), %%xmm2")
+ :: "m" (res[0]), "c" (NULL) );
+
+ set_insn(pcmpestrm);
+ regs.ecx = (unsigned long)res;
+ regs.eflags = X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_IF | X86_EFLAGS_OF;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(pcmpestrm) )
+ goto fail;
+ asm ( "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0x0e00 ||
+ (regs.eflags & X86_EFLAGS_ARITH_MASK) !=
+ (X86_EFLAGS_CF | X86_EFLAGS_ZF | X86_EFLAGS_SF) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing pcmpistri $0x1a,(%ecx),%xmm2...");
+ if ( stack_exec && cpu_has_sse4_2 )
+ {
+ decl_insn(pcmpistri);
+
+ asm volatile ( "movq %0, %%xmm2\n"
+ put_insn(pcmpistri, "pcmpistri $0b00011010, (%1), %%xmm2")
+ :: "m" (res[0]), "c" (NULL) );
+
+ set_insn(pcmpistri);
+ regs.eflags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_IF | X86_EFLAGS_OF;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(pcmpistri) ||
+ regs.ecx != 16 ||
+ (regs.eflags & X86_EFLAGS_ARITH_MASK) !=
+ (X86_EFLAGS_ZF | X86_EFLAGS_SF) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing pcmpistrm $0x4a,(%ecx),%xmm2...");
+ if ( stack_exec && cpu_has_sse4_2 )
+ {
+ decl_insn(pcmpistrm);
+
+ asm volatile ( "movq %0, %%xmm2\n"
+ put_insn(pcmpistrm, "pcmpistrm $0b01001010, (%1), %%xmm2")
+ :: "m" (res[0]), "c" (NULL) );
+
+ set_insn(pcmpistrm);
+ regs.ecx = (unsigned long)res;
+ regs.eflags = X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_IF;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(pcmpistrm) )
+ goto fail;
+ asm ( "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xffff ||
+ (regs.eflags & X86_EFLAGS_ARITH_MASK) !=
+ (X86_EFLAGS_CF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vpcmpestri $0x7a,(%esi),%xmm2...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vpcmpestri);
+
+#ifdef __x86_64__
+ /*
+ * gas up to at least 2.27 doesn't honor explicit "rex.w" for
+ * VEX/EVEX encoded instructions, and also doesn't provide any
+ * other means to control VEX.W.
+ */
+ asm volatile ( "movq %0, %%xmm2\n"
+ put_insn(vpcmpestri,
+ ".byte 0xC4, 0xE3, 0xF9, 0x61, 0x16, 0x7A")
+ :: "m" (res[0]) );
+#else
+ asm volatile ( "movq %0, %%xmm2\n"
+ put_insn(vpcmpestri,
+ "vpcmpestri $0b01111010, (%1), %%xmm2")
+ :: "m" (res[0]), "S" (NULL) );
+#endif
+
+ set_insn(vpcmpestri);
+#ifdef __x86_64__
+ regs.rax = ~0U + 1UL;
+ regs.rcx = ~0UL;
+#else
+ regs.eax = 0x7fffffff;
+#endif
+ regs.esi = (unsigned long)res;
+ regs.eflags = X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF |
+ X86_EFLAGS_IF | X86_EFLAGS_OF;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vpcmpestri) ||
+ regs.ecx != 11 ||
+ (regs.eflags & X86_EFLAGS_ARITH_MASK) !=
+ (X86_EFLAGS_ZF | X86_EFLAGS_CF) )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing stmxcsr (%edx)...");
if ( cpu_has_sse )
{
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -100,6 +100,12 @@ static inline uint64_t xgetbv(uint32_t x
(res.c & (1U << 19)) != 0; \
})

+#define cpu_has_sse4_2 ({ \
+ struct cpuid_leaf res; \
+ emul_test_cpuid(1, 0, &res, NULL); \
+ (res.c & (1U << 20)) != 0; \
+})
+
#define cpu_has_popcnt ({ \
struct cpuid_leaf res; \
emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -365,7 +365,7 @@ static const struct {
[0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x2b] = { .simd_size = simd_packed_int },
[0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
- [0x38 ... 0x3f] = { .simd_size = simd_packed_int },
+ [0x37 ... 0x3f] = { .simd_size = simd_packed_int },
[0x40] = { .simd_size = simd_packed_int },
[0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xf0] = { .two_op = 1 },
@@ -395,6 +395,7 @@ static const struct {
[0x42] = { .simd_size = simd_packed_int },
[0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
+ [0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xf0] = {},
};

@@ -5999,6 +6000,7 @@ x86_emulate(
case X86EMUL_OPC_VEX_66(0x0f38, 0x28): /* vpmuldq {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x29): /* vpcmpeqq {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x2b): /* vpackusdw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x37): /* vpcmpgtq {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x38): /* vpminsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x39): /* vpminsd {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x3a): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -7147,6 +7149,10 @@ x86_emulate(
}
goto movdqa;

+ case X86EMUL_OPC_66(0x0f38, 0x37): /* pcmpgtq xmm/m128,xmm */
+ host_and_vcpu_must_have(sse4_2);
+ goto simd_0f38_common;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);
@@ -7437,6 +7443,63 @@ x86_emulate(
generate_exception_if(vex.w, EXC_UD);
goto simd_0f_int_imm8;

+ case X86EMUL_OPC_66(0x0f3a, 0x60): /* pcmpestrm $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x60): /* vpcmpestrm $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x61): /* pcmpestri $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x61): /* vpcmpestri $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x62): /* pcmpistrm $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x62): /* vpcmpistrm $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x63): /* pcmpistri $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x63): /* vpcmpistri $imm8,xmm/m128,xmm */
+ if ( vex.opcx == vex_none )
+ {
+ host_and_vcpu_must_have(sse4_2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ if ( vex.opcx == vex_none )
+ opc[0] = 0x3a;
+ opc[vex.opcx == vex_none] = b;
+ opc[1 + (vex.opcx == vex_none)] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ /* Convert memory operand to (%rDI). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1 + (vex.opcx == vex_none)] &= 0x3f;
+ opc[1 + (vex.opcx == vex_none)] |= 0x07;
+
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 16, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+ }
+ opc[2 + (vex.opcx == vex_none)] = imm1;
+ fic.insn_bytes = PFX_BYTES + 3 + (vex.opcx == vex_none);
+ opc[3 + (vex.opcx == vex_none)] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+#ifdef __x86_64__
+ if ( rex_prefix & REX_W )
+ emulate_stub("=c" (dst.val), "m" (*mmvalp), "D" (mmvalp),
+ "a" (_regs.rax), "d" (_regs.rdx));
+ else
+#endif
+ emulate_stub("=c" (dst.val), "m" (*mmvalp), "D" (mmvalp),
+ "a" (_regs._eax), "d" (_regs._edx));
+
+ state->simd_size = simd_none;
+ if ( b & 1 )
+ _regs.r(cx) = (uint32_t)dst.val;
+ dst.type = OP_NONE;
+ break;
+
case X86EMUL_OPC_VEX_F2(0x0f3a, 0xf0): /* rorx imm,r/m,r */
vcpu_must_have(bmi2);
generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
Andrew Cooper
2017-03-01 17:21:13 UTC
Permalink
Post by Jan Beulich
... and their AVX equivalents.
(Subject to the same (vex.opcx == vex_none) concern as the previous
patch), Reviewed-by: Andrew Cooper <***@citrix.com>
Jan Beulich
2017-02-28 12:56:29 UTC
Permalink
... and their AVX equivalents.

Signed-off-by: Jan Beulich <***@suse.com>
---
v4: Or in ByteOp for {,v}pinsrb instead of assigning it (in
x86_decode_0f3a()). Correct case label for ptest. Add missing
copy_REX_VEX() to {,v}ptest handling. Add missing immediate bytes
to {,v}pextr* etc handling. dppd requires vex.l clear. Use
consistent approach for stub setup in code paths shared between
VEX- and non-VEX-encoded insns.
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -219,6 +219,13 @@ enum simd_opsize {
*/
simd_single_fp,

+ /*
+ * Scalar floating point:
+ * - 32 bits with low opcode bit clear (scalar single)
+ * - 64 bits with low opcode bit set (scalar double)
+ */
+ simd_scalar_fp,
+
/* Operand size encoded in non-standard way. */
simd_other
};
@@ -349,21 +356,45 @@ static const struct {
uint8_t vsib:1;
} ext0f38_table[256] = {
[0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+ [0x10] = { .simd_size = simd_packed_int },
+ [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
+ [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
[0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
+ [0x28 ... 0x29] = { .simd_size = simd_packed_int },
[0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0x2b] = { .simd_size = simd_packed_int },
+ [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
+ [0x38 ... 0x3f] = { .simd_size = simd_packed_int },
+ [0x40] = { .simd_size = simd_packed_int },
+ [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xf0] = { .two_op = 1 },
[0xf1] = { .to_memory = 1, .two_op = 1 },
[0xf2 ... 0xf3] = {},
[0xf5 ... 0xf7] = {},
};

+/* Shift values between src and dst sizes of pmov{s,z}x{b,w,d}{w,d,q}. */
+static const uint8_t pmov_convert_delta[] = { 1, 2, 3, 1, 2, 1 };
+
static const struct {
uint8_t simd_size:5;
uint8_t to_memory:1;
uint8_t two_op:1;
uint8_t four_op:1;
} ext0f3a_table[256] = {
- [0x0f] = { .simd_size = simd_packed_int },
+ [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
+ [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
+ [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
+ [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
+ [0x14 ... 0x17] = { .simd_size = simd_none, .to_memory = 1, .two_op = 1 },
+ [0x20] = { .simd_size = simd_none },
+ [0x21] = { .simd_size = simd_other },
+ [0x22] = { .simd_size = simd_none },
+ [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
+ [0x42] = { .simd_size = simd_packed_int },
+ [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
+ [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
[0xf0] = {},
};

@@ -2314,6 +2345,33 @@ x86_decode_0f38(
}

static int
+x86_decode_0f3a(
+ struct x86_emulate_state *state,
+ struct x86_emulate_ctxt *ctxt,
+ const struct x86_emulate_ops *ops)
+{
+ if ( !vex.opcx )
+ ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+
+ switch ( ctxt->opcode & X86EMUL_OPC_MASK )
+ {
+ case X86EMUL_OPC_66(0, 0x20): /* pinsrb */
+ case X86EMUL_OPC_VEX_66(0, 0x20): /* vpinsrb */
+ state->desc = DstImplicit | SrcMem;
+ if ( modrm_mod != 3 )
+ state->desc |= ByteOp;
+ break;
+
+ case X86EMUL_OPC_66(0, 0x22): /* pinsr{d,q} */
+ case X86EMUL_OPC_VEX_66(0, 0x22): /* vpinsr{d,q} */
+ state->desc = DstImplicit | SrcMem;
+ break;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int
x86_decode(
struct x86_emulate_state *state,
struct x86_emulate_ctxt *ctxt,
@@ -2801,8 +2859,7 @@ x86_decode(
imm1 &= 0x7f;
state->desc = d;
state->simd_size = ext0f3a_table[b].simd_size;
- if ( !vex.opcx )
- ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ rc = x86_decode_0f3a(state, ctxt, ops);
break;

case ext_8f08:
@@ -2866,6 +2923,10 @@ x86_decode(
}
break;

+ case simd_scalar_fp:
+ op_bytes = 4 << (ctxt->opcode & 1);
+ break;
+
default:
op_bytes = 0;
break;
@@ -5935,6 +5996,18 @@ x86_emulate(
case X86EMUL_OPC_VEX_66(0x0f38, 0x1c): /* vpabsb {x,y}mm/mem,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x1d): /* vpabsw {x,y}mm/mem,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x1e): /* vpabsd {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x28): /* vpmuldq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x29): /* vpcmpeqq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x2b): /* vpackusdw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x38): /* vpminsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x39): /* vpminsd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x3a): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x3b): /* vpminud {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x3c): /* vpmaxsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x3d): /* vpmaxsd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x3e): /* vpmaxub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x3f): /* vpmaxud {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( !vex.l )
goto simd_0f_avx;
host_and_vcpu_must_have(avx2);
@@ -5947,6 +6020,10 @@ x86_emulate(
get_fpu(X86EMUL_FPU_mmx, &fic);
goto simd_0f_common;

+ case X86EMUL_OPC_VEX_66(0x0f38, 0x41): /* vphminposuw xmm/m128,xmm,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ goto simd_0f_avx;
+
CASE_SIMD_PACKED_INT(0x0f, 0x6e): /* mov{d,q} r/m,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
CASE_SIMD_PACKED_INT(0x0f, 0x7e): /* mov{d,q} {,x}mm,r/m */
@@ -6030,11 +6107,20 @@ x86_emulate(
simd_0f_int_imm8:
if ( vex.opcx != vex_none )
{
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x0e): /* vpblendw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( vex.l )
host_and_vcpu_must_have(avx2);
else
{
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x09): /* vroundpd $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x0a): /* vroundss $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x0b): /* vroundsd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x0c): /* vblendps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x0d): /* vblendpd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x40): /* vdpps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
simd_0f_imm8_avx:
host_and_vcpu_must_have(avx);
}
@@ -6931,7 +7017,10 @@ x86_emulate(
case X86EMUL_OPC_66(0x0f38, 0x1e): /* pabsd xmm/m128,xmm */
host_and_vcpu_must_have(ssse3);
if ( vex.pfx )
+ {
+ simd_0f38_common:
get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
else
{
host_and_vcpu_must_have(mmx);
@@ -6951,6 +7040,97 @@ x86_emulate(
fic.insn_bytes = PFX_BYTES + 3;
break;

+ case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x23): /* pmovsxwd xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x24): /* pmovsxwq xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x25): /* pmovsxdq xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x30): /* pmovzxbw xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x31): /* pmovzxbd xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x32): /* pmovzxbq xmm/m16,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x33): /* pmovzxwd xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x34): /* pmovzxwq xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x35): /* pmovzxdq xmm/m64,xmm */
+ op_bytes = 16 >> pmov_convert_delta[b & 7];
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f38, 0x10): /* pblendvb XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x14): /* blendvps XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x15): /* blendvpd XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x28): /* pmuldq xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x29): /* pcmpeqq xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x2b): /* packusdw xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x38): /* pminsb xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x39): /* pminsd xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3a): /* pminub xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3b): /* pminud xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3c): /* pmaxsb xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3d): /* pmaxsd xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3e): /* pmaxub xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3f): /* pmaxud xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x40): /* pmulld xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x41): /* phminposuw xmm/m128,xmm */
+ host_and_vcpu_must_have(sse4_1);
+ goto simd_0f38_common;
+
+ case X86EMUL_OPC_66(0x0f38, 0x17): /* ptest xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
+ if ( vex.opcx == vex_none )
+ {
+ host_and_vcpu_must_have(sse4_1);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ if ( vex.opcx == vex_none )
+ opc[0] = 0x38;
+ opc[vex.opcx == vex_none] = b;
+ opc[1 + (vex.opcx == vex_none)] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 16 << vex.l, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ opc[1 + (vex.opcx == vex_none)] &= 0x38;
+ }
+ fic.insn_bytes = PFX_BYTES + 2 + (vex.opcx == vex_none);
+ opc[2 + (vex.opcx == vex_none)] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ emulate_stub("+m" (*mmvalp), "a" (mmvalp));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ state->simd_size = simd_none;
+ dst.type = OP_NONE;
+ break;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x23): /* vpmovsxwd xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x25): /* vpmovsxdq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x30): /* vpmovzxbw xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x33): /* vpmovzxwd xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x35): /* vpmovzxdq xmm/mem,{x,y}mm */
+ op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
+ goto simd_0f_int;
+
case X86EMUL_OPC_66(0x0f38, 0x2a): /* movntdqa m128,xmm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -7136,7 +7316,10 @@ x86_emulate(
case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
host_and_vcpu_must_have(ssse3);
if ( vex.pfx )
+ {
+ simd_0f3a_common:
get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
else
{
host_and_vcpu_must_have(mmx);
@@ -7157,6 +7340,103 @@ x86_emulate(
fic.insn_bytes = PFX_BYTES + 4;
break;

+ case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x0b): /* roundsd $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x0c): /* blendps $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x0d): /* blendpd $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x0e): /* pblendw $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x40): /* dpps $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x41): /* dppd $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x42): /* mpsadbw $imm8,xmm/m128,xmm */
+ host_and_vcpu_must_have(sse4_1);
+ goto simd_0f3a_common;
+
+ case X86EMUL_OPC_66(0x0f3a, 0x14): /* pextrb $imm8,xmm,r/m */
+ case X86EMUL_OPC_66(0x0f3a, 0x15): /* pextrw $imm8,xmm,r/m */
+ case X86EMUL_OPC_66(0x0f3a, 0x16): /* pextr{d,q} $imm8,xmm,r/m */
+ case X86EMUL_OPC_66(0x0f3a, 0x17): /* extractps $imm8,xmm,r/m */
+ host_and_vcpu_must_have(sse4_1);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+
+ opc = init_prefixes(stub);
+ opc[0] = 0x3a;
+ pextr:
+ opc[vex.opcx == vex_none] = b;
+ /* Convert memory/GPR operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1 + (vex.opcx == vex_none)] = modrm & 0x38;
+ opc[2 + (vex.opcx == vex_none)] = imm1;
+ fic.insn_bytes = PFX_BYTES + 3 + (vex.opcx == vex_none);
+ opc[3 + (vex.opcx == vex_none)] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=m" (dst.val) : "a" (&dst.val));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ dst.bytes = dst.type == OP_REG || b == 0x17 ? 4 : 1 << (b & 3);
+ if ( b == 0x16 && (rex_prefix & REX_W) )
+ dst.bytes = 8;
+ break;
+
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x14): /* vpextrb $imm8,xmm,r/m */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x15): /* vpextrw $imm8,xmm,r/m */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x16): /* vpextr{d,q} $imm8,xmm,r/m */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x17): /* vextractps $imm8,xmm,r/m */
+ generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ opc = init_prefixes(stub);
+ goto pextr;
+
+ case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
+ host_and_vcpu_must_have(sse4_1);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ memcpy(mmvalp, &src.val, op_bytes);
+ ea.type = OP_MEM;
+ op_bytes = src.bytes;
+ d = SrcMem16; /* Fake for the common SIMD code below. */
+ state->simd_size = simd_other;
+ goto simd_0f3a_common;
+
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x20): /* vpinsrb $imm8,r32/m8,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x22): /* vpinsr{d,q} $imm8,r/m,xmm,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ memcpy(mmvalp, &src.val, op_bytes);
+ ea.type = OP_MEM;
+ op_bytes = src.bytes;
+ d = SrcMem16; /* Fake for the common SIMD code below. */
+ state->simd_size = simd_other;
+ goto simd_0f_int_imm8;
+
+ case X86EMUL_OPC_66(0x0f3a, 0x21): /* insertps $imm8,xmm/m32,xmm */
+ host_and_vcpu_must_have(sse4_1);
+ op_bytes = 4;
+ goto simd_0f3a_common;
+
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m128,xmm,xmm */
+ op_bytes = 4;
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x41): /* vdppd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.l, EXC_UD);
+ goto simd_0f_imm8_avx;
+
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_imm8_avx;
+
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_int_imm8;
+
case X86EMUL_OPC_VEX_F2(0x0f3a, 0xf0): /* rorx imm,r/m,r */
vcpu_must_have(bmi2);
generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -40,6 +40,7 @@
#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
#define cpu_has_sse3 boot_cpu_has(X86_FEATURE_SSE3)
#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
+#define cpu_has_sse4_1 boot_cpu_has(X86_FEATURE_SSE4_1)
#define cpu_has_sse4_2 boot_cpu_has(X86_FEATURE_SSE4_2)
#define cpu_has_popcnt boot_cpu_has(X86_FEATURE_POPCNT)
#define cpu_has_htt boot_cpu_has(X86_FEATURE_HTT)
Andrew Cooper
2017-03-01 16:58:05 UTC
Permalink
Post by Jan Beulich
@@ -6951,6 +7040,97 @@ x86_emulate(
fic.insn_bytes = PFX_BYTES + 3;
break;
+ case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x23): /* pmovsxwd xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x24): /* pmovsxwq xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x25): /* pmovsxdq xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x30): /* pmovzxbw xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x31): /* pmovzxbd xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x32): /* pmovzxbq xmm/m16,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x33): /* pmovzxwd xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x34): /* pmovzxwq xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x35): /* pmovzxdq xmm/m64,xmm */
+ op_bytes = 16 >> pmov_convert_delta[b & 7];
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f38, 0x10): /* pblendvb XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x14): /* blendvps XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x15): /* blendvpd XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x28): /* pmuldq xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x29): /* pcmpeqq xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x2b): /* packusdw xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x38): /* pminsb xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x39): /* pminsd xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3a): /* pminub xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3b): /* pminud xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3c): /* pmaxsb xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3d): /* pmaxsd xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3e): /* pmaxub xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3f): /* pmaxud xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x40): /* pmulld xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x41): /* phminposuw xmm/m128,xmm */
+ host_and_vcpu_must_have(sse4_1);
+ goto simd_0f38_common;
+
+ case X86EMUL_OPC_66(0x0f38, 0x17): /* ptest xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
+ if ( vex.opcx == vex_none )
+ {
+ host_and_vcpu_must_have(sse4_1);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ if ( vex.opcx == vex_none )
+ opc[0] = 0x38;
+ opc[vex.opcx == vex_none] = b;
+ opc[1 + (vex.opcx == vex_none)] = modrm;
This use of (vex.opcx == vex_none) for construction is very awkward to read.

How about:

    if ( vex.opcx == vex_none )
    {
        opc[0] = 0x38;
        opc++; /* Adjust for extra prefix. */
    }

    ...

    if ( vex.opcx == vex_none )
        opc--; /* Undo adjustment for extra prefix. */

which allows the rest of the opc[] setup to read like all the other
similar code.


In fact, thinking more about this, using a pointer-arithmetic-based
method of filling the stub would allow for the removal of
"fic.insn_bytes = PFX_BYTES + $X", and with it any chance of getting
the count wrong.
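
As a minimal standalone sketch of that idea (the function and parameter
names are invented for illustration, not a proposed emulator interface),
the length simply falls out of the pointer difference:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical example: build a stub and derive its length from
     * pointer arithmetic instead of hard-coding a byte count. */
    static size_t fill_stub(uint8_t *stub, bool legacy_encoded,
                            uint8_t opcode, uint8_t modrm, uint8_t imm)
    {
        uint8_t *p = stub;

        if ( legacy_encoded )
            *p++ = 0x3a;   /* extra escape byte for the non-VEX form only */
        *p++ = opcode;
        *p++ = modrm;
        *p++ = imm;
        *p++ = 0xc3;       /* ret, so the stub can be invoked and return */

        return p - stub;   /* no hand-maintained count to get wrong */
    }

The trade-off, as the reply below notes, is that the stub's base address
then needs to be kept around for the subtraction.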

~Andrew
Jan Beulich
2017-03-02 08:26:56 UTC
Permalink
Post by Andrew Cooper
Post by Jan Beulich
@@ -6951,6 +7040,97 @@ x86_emulate(
fic.insn_bytes = PFX_BYTES + 3;
break;
+ case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x23): /* pmovsxwd xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x24): /* pmovsxwq xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x25): /* pmovsxdq xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x30): /* pmovzxbw xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x31): /* pmovzxbd xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x32): /* pmovzxbq xmm/m16,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x33): /* pmovzxwd xmm/m64,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x34): /* pmovzxwq xmm/m32,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x35): /* pmovzxdq xmm/m64,xmm */
+ op_bytes = 16 >> pmov_convert_delta[b & 7];
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f38, 0x10): /* pblendvb XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x14): /* blendvps XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x15): /* blendvpd XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x28): /* pmuldq xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x29): /* pcmpeqq xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x2b): /* packusdw xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x38): /* pminsb xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x39): /* pminsd xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3a): /* pminub xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3b): /* pminud xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3c): /* pmaxsb xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3d): /* pmaxsd xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3e): /* pmaxub xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x3f): /* pmaxud xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x40): /* pmulld xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0x41): /* phminposuw xmm/m128,xmm */
+ host_and_vcpu_must_have(sse4_1);
+ goto simd_0f38_common;
+
+ case X86EMUL_OPC_66(0x0f38, 0x17): /* ptest xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
+ if ( vex.opcx == vex_none )
+ {
+ host_and_vcpu_must_have(sse4_1);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ if ( vex.opcx == vex_none )
+ opc[0] = 0x38;
+ opc[vex.opcx == vex_none] = b;
+ opc[1 + (vex.opcx == vex_none)] = modrm;
This use of (vex.opcx == vex_none) for construction is very awkward to read.
    if ( vex.opcx == vex_none )
    {
        opc[0] = 0x38;
        opc++; /* Adjust for extra prefix. */
    }
    ...
    if ( vex.opcx == vex_none )
        opc--; /* Undo adjustment for extra prefix. */
which allows the rest of the opc[] setup to read like all the other
similar code.
I can do that, since you think it reads better (which I'm not sure of).
Post by Andrew Cooper
In fact, thinking more about this, using a pointer-arithmetic-based
method of filling the stub would allow for the removal of
"fic.insn_bytes = PFX_BYTES + $X", and with it any chance of getting
the count wrong.
Well, that would leave the need to store the base address of the
stub somewhere (which would require keeping the very #ifdef-ary you
disliked in an earlier patch). I'd rather keep it the way it is.

Jan
Jan Beulich
2017-02-28 12:58:10 UTC
Permalink
... and its AVX equivalent.

Signed-off-by: Jan Beulich <***@suse.com>
---
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -393,6 +393,7 @@ static const struct {
[0x22] = { .simd_size = simd_none },
[0x40 ... 0x41] = { .simd_size = simd_packed_fp },
[0x42] = { .simd_size = simd_packed_int },
+ [0x44] = { .simd_size = simd_packed_int },
[0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
[0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
@@ -1457,6 +1458,7 @@ static bool vcpu_has(
#define vcpu_has_sse() vcpu_has( 1, EDX, 25, ctxt, ops)
#define vcpu_has_sse2() vcpu_has( 1, EDX, 26, ctxt, ops)
#define vcpu_has_sse3() vcpu_has( 1, ECX, 0, ctxt, ops)
+#define vcpu_has_pclmulqdq() vcpu_has( 1, ECX, 1, ctxt, ops)
#define vcpu_has_ssse3() vcpu_has( 1, ECX, 9, ctxt, ops)
#define vcpu_has_cx16() vcpu_has( 1, ECX, 13, ctxt, ops)
#define vcpu_has_sse4_1() vcpu_has( 1, ECX, 19, ctxt, ops)
@@ -7434,6 +7436,14 @@ x86_emulate(
generate_exception_if(vex.l, EXC_UD);
goto simd_0f_imm8_avx;

+ case X86EMUL_OPC_66(0x0f3a, 0x44): /* pclmulqdq $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
+ host_and_vcpu_must_have(pclmulqdq);
+ if ( vex.opcx == vex_none )
+ goto simd_0f3a_common;
+ generate_exception_if(vex.l, EXC_UD);
+ goto simd_0f_imm8_avx;
+
case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
generate_exception_if(vex.w, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -42,6 +42,7 @@
#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
#define cpu_has_sse4_1 boot_cpu_has(X86_FEATURE_SSE4_1)
#define cpu_has_sse4_2 boot_cpu_has(X86_FEATURE_SSE4_2)
+#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
#define cpu_has_popcnt boot_cpu_has(X86_FEATURE_POPCNT)
#define cpu_has_htt boot_cpu_has(X86_FEATURE_HTT)
#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
Andrew Cooper
2017-03-01 17:44:17 UTC
Permalink
Post by Jan Beulich
... and its AVX equivalent.
---
v3: New.
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -393,6 +393,7 @@ static const struct {
[0x22] = { .simd_size = simd_none },
[0x40 ... 0x41] = { .simd_size = simd_packed_fp },
[0x42] = { .simd_size = simd_packed_int },
+ [0x44] = { .simd_size = simd_packed_int },
[0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
[0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
@@ -1457,6 +1458,7 @@ static bool vcpu_has(
#define vcpu_has_sse() vcpu_has( 1, EDX, 25, ctxt, ops)
#define vcpu_has_sse2() vcpu_has( 1, EDX, 26, ctxt, ops)
#define vcpu_has_sse3() vcpu_has( 1, ECX, 0, ctxt, ops)
+#define vcpu_has_pclmulqdq() vcpu_has( 1, ECX, 1, ctxt, ops)
#define vcpu_has_ssse3() vcpu_has( 1, ECX, 9, ctxt, ops)
#define vcpu_has_cx16() vcpu_has( 1, ECX, 13, ctxt, ops)
#define vcpu_has_sse4_1() vcpu_has( 1, ECX, 19, ctxt, ops)
@@ -7434,6 +7436,14 @@ x86_emulate(
generate_exception_if(vex.l, EXC_UD);
goto simd_0f_imm8_avx;
+ case X86EMUL_OPC_66(0x0f3a, 0x44): /* pclmulqdq $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
+ host_and_vcpu_must_have(pclmulqdq);
+ if ( vex.opcx == vex_none )
+ goto simd_0f3a_common;
What is this for? There are no other instructions defined (that I can
find) in 0f3a44.
Post by Jan Beulich
+ generate_exception_if(vex.l, EXC_UD);
+ goto simd_0f_imm8_avx;
+
case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
generate_exception_if(vex.w, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -42,6 +42,7 @@
#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
#define cpu_has_sse4_1 boot_cpu_has(X86_FEATURE_SSE4_1)
#define cpu_has_sse4_2 boot_cpu_has(X86_FEATURE_SSE4_2)
+#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
#define cpu_has_popcnt boot_cpu_has(X86_FEATURE_POPCNT)
#define cpu_has_htt boot_cpu_has(X86_FEATURE_HTT)
#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
Jan Beulich
2017-03-02 08:30:12 UTC
Permalink
Post by Andrew Cooper
Post by Jan Beulich
@@ -7434,6 +7436,14 @@ x86_emulate(
generate_exception_if(vex.l, EXC_UD);
goto simd_0f_imm8_avx;
+ case X86EMUL_OPC_66(0x0f3a, 0x44): /* pclmulqdq $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
+ host_and_vcpu_must_have(pclmulqdq);
+ if ( vex.opcx == vex_none )
+ goto simd_0f3a_common;
What is this for? There are no other instructions defined (that I can
find) in 0f3a44.
Perhaps I'm misunderstanding the question. Are you mixing up
vex.opcx and vex.pfx? We want the non-VEX case handled by
the code at simd_0f3a_common, and the VEX one by the code
further down.
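
To make the distinction concrete, a small standalone sketch (the struct
and helper below are invented purely for illustration; only the
vex.opcx / vex.pfx split itself comes from the code being discussed):

    #include <stdbool.h>

    /* Illustration only: two independent properties of an 0f3a-space insn.
     * The 66/F3/F2 prefix (VEX.pp for VEX encodings) selects the column in
     * the opcode map; whether the insn was VEX-encoded at all is separate. */
    struct insn_encoding {
        bool vex_encoded;       /* the vex.opcx != vex_none test above */
        unsigned int simd_pfx;  /* mirrors VEX.pp: 0=none, 1=66, 2=F3, 3=F2 */
    };

    static bool is_legacy_pclmulqdq(const struct insn_encoding *e)
    {
        /* 66 0f 3a 44 is legacy-encoded, yet still carries the 66 prefix. */
        return !e->vex_encoded && e->simd_pfx == 1;
    }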

Jan

Jan Beulich
2017-02-28 12:57:35 UTC
Permalink
... and their AVX equivalents. Note that a few instructions aren't
covered (yet), but those all fall into common pattern groups, so I
would hope that for now we can make do with what is there.

Just like for SSE/SSE2, MMX insns aren't being covered at all, as
they're not easy to deal with: The compiler refuses to emit them
for anything other than uses of built-in functions.
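
For reference, a standalone illustration of that point (not part of the
test harness): an explicit intrinsic such as the one below does force an
MMX instruction out of the compiler, whereas the generic vector
arithmetic simd.c relies on does not get turned into MMX code.

    #include <mmintrin.h>

    /* Built with -mmmx: only a built-in/intrinsic like _mm_add_pi8() makes
     * the compiler emit an MMX insn (a paddb here). */
    __m64 add_bytes(__m64 a, __m64 b)
    {
        return _mm_add_pi8(a, b);
    }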

Signed-off-by: Jan Beulich <***@suse.com>
---
v4: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -22,24 +22,31 @@ sse-flts := 4
sse2-vecs := $(sse-vecs)
sse2-ints := 1 2 4 8
sse2-flts := 4 8
+sse4-vecs := $(sse2-vecs)
+sse4-ints := $(sse2-ints)
+sse4-flts := $(sse2-flts)

# When converting SSE to AVX, have the compiler avoid XMM0 to widen
-# coverage of the VEX.vvvv checks in the emulator.
-sse2avx := -ffixed-xmm0 -Wa,-msse2avx
+# coverage of the VEX.vvvv checks in the emulator. We must not do this,
+# however, for SSE4.1 and later, as there are instructions with XMM0 as
+# an implicit operand.
+sse2avx-sse := -ffixed-xmm0 -Wa,-msse2avx
+sse2avx-sse2 := $(sse2avx-sse)
+sse2avx-sse4 := -Wa,-msse2avx

-simd-cflags := $(foreach flavor,sse sse2, \
+simd-cflags := $(foreach flavor,sse sse2 sse4, \
$(foreach vec,$($(flavor)-vecs), \
$(foreach int,$($(flavor)-ints), \
"-D$(flavor)_$(vec)i$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
"-D$(flavor)_$(vec)u$(int) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)" \
- "-D$(flavor)_avx_$(vec)i$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
- "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
+ "-D$(flavor)_avx_$(vec)i$(int) -m$(flavor) $(sse2avx-$(flavor)) -O2 -DVEC_SIZE=$(vec) -DINT_SIZE=$(int)" \
+ "-D$(flavor)_avx_$(vec)u$(int) -m$(flavor) $(sse2avx-$(flavor)) -O2 -DVEC_SIZE=$(vec) -DUINT_SIZE=$(int)") \
$(foreach flt,$($(flavor)-flts), \
"-D$(flavor)_$(vec)f$(flt) -m$(flavor) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)" \
- "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) $(sse2avx) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
+ "-D$(flavor)_avx_$(vec)f$(flt) -m$(flavor) $(sse2avx-$(flavor)) -O2 -DVEC_SIZE=$(vec) -DFLOAT_SIZE=$(flt)")) \
$(foreach flt,$($(flavor)-flts), \
"-D$(flavor)_f$(flt) -m$(flavor) -mfpmath=sse -O2 -DFLOAT_SIZE=$(flt)" \
- "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=sse $(sse2avx) -O2 -DFLOAT_SIZE=$(flt)"))
+ "-D$(flavor)_avx_f$(flt) -m$(flavor) -mfpmath=sse $(sse2avx-$(flavor)) -O2 -DFLOAT_SIZE=$(flt)"))

$(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
rm -f $@.new $*.bin
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -70,7 +70,9 @@ typedef long long __attribute__((vector_
#if VEC_SIZE == 8 && defined(__SSE__)
# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
#elif VEC_SIZE == 16
-# if defined(__SSE__) && ELEM_SIZE == 4
+# if defined(__SSE4_1__)
+# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
+# elif defined(__SSE__) && ELEM_SIZE == 4
# define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
# elif defined(__SSE2__)
# if ELEM_SIZE == 8
@@ -182,9 +184,122 @@ static inline bool _to_bool(byte_vec_t b
__builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
})
#endif
+#if VEC_SIZE == 16 && defined(__SSE3__)
+# if FLOAT_SIZE == 4
+# define addsub(x, y) __builtin_ia32_addsubps(x, y)
+# define dup_hi(x) __builtin_ia32_movshdup(x)
+# define dup_lo(x) __builtin_ia32_movsldup(x)
+# define hadd(x, y) __builtin_ia32_haddps(x, y)
+# define hsub(x, y) __builtin_ia32_hsubps(x, y)
+# elif FLOAT_SIZE == 8
+# define addsub(x, y) __builtin_ia32_addsubpd(x, y)
+# define dup_lo(x) ({ \
+ double __attribute__((vector_size(16))) r_; \
+ asm ( "movddup %1,%0" : "=x" (r_) : "m" ((x)[0]) ); \
+ r_; \
+})
+# define hadd(x, y) __builtin_ia32_haddpd(x, y)
+# define hsub(x, y) __builtin_ia32_hsubpd(x, y)
+# endif
+#endif
+#if VEC_SIZE == 16 && defined(__SSSE3__)
+# if INT_SIZE == 1
+# define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
+# elif INT_SIZE == 2
+# define abs(x) __builtin_ia32_pabsw128(x)
+# elif INT_SIZE == 4
+# define abs(x) __builtin_ia32_pabsd128(x)
+# endif
+# if INT_SIZE == 1 || UINT_SIZE == 1
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignb128((vqi_t)(x), (vqi_t)(y)))
+# define swap(x) ((vec_t)__builtin_ia32_pshufb128((vqi_t)(x), (vqi_t)(inv - 1)))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 8))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignw128((vhi_t)(x), (vhi_t)(y)))
+# define hadd(x, y) ((vec_t)__builtin_ia32_phaddw128((vhi_t)(x), (vhi_t)(y)))
+# define hsub(x, y) ((vec_t)__builtin_ia32_phsubw128((vhi_t)(x), (vhi_t)(y)))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 16))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+# define copysignz(x, y) ((vec_t)__builtin_ia32_psignd128((vsi_t)(x), (vsi_t)(y)))
+# define hadd(x, y) ((vec_t)__builtin_ia32_phaddd128((vsi_t)(x), (vsi_t)(y)))
+# define hsub(x, y) ((vec_t)__builtin_ia32_phsubd128((vsi_t)(x), (vsi_t)(y)))
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 32))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
+# endif
+#endif
+#if VEC_SIZE == 16 && defined(__SSE4_1__)
+# if INT_SIZE == 1
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw128((vqi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd128((vqi_t)(x)))
+# define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq128((vqi_t)(x)))
+# elif INT_SIZE == 2
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd128(x))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq128(x))
+# elif INT_SIZE == 4
+# define max(x, y) __builtin_ia32_pmaxsd128(x, y)
+# define min(x, y) __builtin_ia32_pminsd128(x, y)
+# define mul_full(x, y) ((vec_t)__builtin_ia32_pmuldq128(x, y))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq128(x))
+# elif UINT_SIZE == 1
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw128((vqi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd128((vqi_t)(x)))
+# define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq128((vqi_t)(x)))
+# elif UINT_SIZE == 2
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxuw128((vhi_t)(x), (vhi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminuw128((vhi_t)(x), (vhi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd128((vhi_t)(x)))
+# define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq128((vhi_t)(x)))
+# elif UINT_SIZE == 4
+# define max(x, y) ((vec_t)__builtin_ia32_pmaxud128((vsi_t)(x), (vsi_t)(y)))
+# define min(x, y) ((vec_t)__builtin_ia32_pminud128((vsi_t)(x), (vsi_t)(y)))
+# define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq128((vsi_t)(x)))
+# endif
+# undef select
+# if defined(INT_SIZE) || defined(UINT_SIZE)
+# define select(d, x, y, m) \
+ (*(d) = (vec_t)__builtin_ia32_pblendvb128((vqi_t)(y), (vqi_t)(x), (vqi_t)(m)))
+# elif FLOAT_SIZE == 4
+# define dot_product(x, y) __builtin_ia32_dpps(x, y, 0b11110001)
+# define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps(y, x, m))
+# define trunc(x) __builtin_ia32_roundps(x, 0b1011)
+# elif FLOAT_SIZE == 8
+# define dot_product(x, y) __builtin_ia32_dppd(x, y, 0b00110001)
+# define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd(y, x, m))
+# define trunc(x) __builtin_ia32_roundpd(x, 0b1011)
+# endif
+# if INT_SIZE == 2 || UINT_SIZE == 2
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b10101010))
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11001100))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+# define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11110000))
+# elif FLOAT_SIZE == 4
+# define mix(x, y) __builtin_ia32_blendps(x, y, 0b1010)
+# elif FLOAT_SIZE == 8
+# define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
+# endif
+#endif
#if VEC_SIZE == FLOAT_SIZE
# define max(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
# define min(x, y) ((vec_t){({ typeof(x[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
+# ifdef __SSE4_1__
+# if FLOAT_SIZE == 4
+# define trunc(x) ({ \
+ float __attribute__((vector_size(16))) r_; \
+ asm ( "roundss $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
+ (vec_t){ r_[0] }; \
+})
+# elif FLOAT_SIZE == 8
+# define trunc(x) ({ \
+ double __attribute__((vector_size(16))) r_; \
+ asm ( "roundsd $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
+ (vec_t){ r_[0] }; \
+})
+# endif
+# endif
#endif

/*
@@ -290,6 +405,14 @@ int simd_test(void)
if ( !to_bool(sqrt(x) == src) ) return __LINE__;
# endif

+# ifdef trunc
+ x = 1 / src;
+ y = (vec_t){ 1 };
+ touch(x);
+ z = trunc(x);
+ if ( !to_bool(y == z) ) return __LINE__;
+# endif
+
#else

# if ELEM_SIZE > 1
@@ -416,6 +539,17 @@ int simd_test(void)
# endif
#endif

+#ifdef abs
+ x = src * alt;
+ touch(x);
+ if ( !to_bool(abs(x) == src) ) return __LINE__;
+#endif
+
+#ifdef copysignz
+ touch(alt);
+ if ( !to_bool(copysignz((vec_t){} + 1, alt) == alt) ) return __LINE__;
+#endif
+
#ifdef swap
touch(src);
if ( !to_bool(swap(src) == inv) ) return __LINE__;
@@ -435,16 +569,140 @@ int simd_test(void)
if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
#endif

+#if defined(INT_SIZE) && defined(widen1) && defined(interleave_lo)
+
+ x = src * alt;
+ y = interleave_lo(x, alt < 0);
+ touch(x);
+ z = widen1(x);
+ touch(x);
+ if ( !to_bool(z == y) ) return __LINE__;
+
+# ifdef widen2
+ y = interleave_lo(alt < 0, alt < 0);
+ y = interleave_lo(z, y);
+ touch(x);
+ z = widen2(x);
+ touch(x);
+ if ( !to_bool(z == y) ) return __LINE__;
+
+# ifdef widen3
+ y = interleave_lo(alt < 0, alt < 0);
+ y = interleave_lo(y, y);
+ y = interleave_lo(z, y);
+ touch(x);
+ z = widen3(x);
+ touch(x);
+ if ( !to_bool(z == y) ) return __LINE__;
+# endif
+# endif
+
+#endif
+
+#if defined(UINT_SIZE) && defined(interleave_lo)
+
+ y = interleave_lo(src, (vec_t){});
+ z = interleave_lo(y, (vec_t){});
+
+# ifdef widen1
+ touch(src);
+ x = widen1(src);
+ touch(src);
+ if ( !to_bool(x == y) ) return __LINE__;
+# endif
+
+# ifdef widen2
+ touch(src);
+ x = widen2(src);
+ touch(src);
+ if ( !to_bool(x == z) ) return __LINE__;
+# endif
+
+# ifdef widen3
+ touch(src);
+ x = widen3(src);
+ touch(src);
+ if ( !to_bool(x == interleave_lo(z, (vec_t){})) ) return __LINE__;
+# endif
+
+#endif
+
+#ifdef dup_lo
+ touch(src);
+ x = dup_lo(src);
+ touch(src);
+ if ( !to_bool(x - src == (alt - 1) / 2) ) return __LINE__;
+#endif
+
+#ifdef dup_hi
+ touch(src);
+ x = dup_hi(src);
+ touch(src);
+ if ( !to_bool(x - src == (alt + 1) / 2) ) return __LINE__;
+#endif
+
+ for ( i = 0; i < ELEM_COUNT; ++i )
+ y[i] = (i & 1 ? inv : src)[i];
+
#ifdef select
# ifdef UINT_SIZE
select(&z, src, inv, alt);
# else
select(&z, src, inv, alt > 0);
# endif
- for ( i = 0; i < ELEM_COUNT; ++i )
- y[i] = (i & 1 ? inv : src)[i];
if ( !to_bool(z == y) ) return __LINE__;
#endif

+#ifdef mix
+ touch(src);
+ touch(inv);
+ x = mix(src, inv);
+ if ( !to_bool(x == y) ) return __LINE__;
+
+# ifdef addsub
+ touch(src);
+ touch(inv);
+ x = addsub(src, inv);
+ touch(src);
+ touch(inv);
+ y = mix(src - inv, src + inv);
+ if ( !to_bool(x == y) ) return __LINE__;
+# endif
+#endif
+
+#ifdef rotr
+ x = rotr(src, 1);
+ y = (src & (ELEM_COUNT - 1)) + 1;
+ if ( !to_bool(x == y) ) return __LINE__;
+#endif
+
+#ifdef dot_product
+ touch(src);
+ touch(inv);
+ x = dot_product(src, inv);
+ if ( !to_bool(x == (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
+ (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
+#endif
+
+#ifdef hadd
+ x = src;
+ for ( i = ELEM_COUNT; i >>= 1; )
+ {
+ touch(x);
+ x = hadd((vec_t){}, x);
+ }
+ if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
+
+# ifdef hsub
+ touch(src);
+ touch(inv);
+ x = hsub(src, inv);
+ for ( i = ELEM_COUNT; i >>= 1; )
+ x = hadd(x, (vec_t){});
+ if ( !to_bool(x == 0) ) return __LINE__;
+# endif
+#endif
+
+
return 0;
}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -30,12 +30,18 @@ static bool simd_check_sse2(void)
return cpu_has_sse2;
}

+static bool simd_check_sse4(void)
+{
+ return cpu_has_sse4_2;
+}
+
static bool simd_check_avx(void)
{
return cpu_has_avx;
}
#define simd_check_sse_avx simd_check_avx
#define simd_check_sse2_avx simd_check_avx
+#define simd_check_sse4_avx simd_check_avx

static void simd_set_regs(struct cpu_user_regs *regs)
{
@@ -99,6 +105,18 @@ static const struct {
SIMD(SSE2 packed u32, sse2, 16u4),
SIMD(SSE2 packed s64, sse2, 16i8),
SIMD(SSE2 packed u64, sse2, 16u8),
+ SIMD(SSE4 scalar single, sse4, f4),
+ SIMD(SSE4 packed single, sse4, 16f4),
+ SIMD(SSE4 scalar double, sse4, f8),
+ SIMD(SSE4 packed double, sse4, 16f8),
+ SIMD(SSE4 packed s8, sse4, 16i1),
+ SIMD(SSE4 packed u8, sse4, 16u1),
+ SIMD(SSE4 packed s16, sse4, 16i2),
+ SIMD(SSE4 packed u16, sse4, 16u2),
+ SIMD(SSE4 packed s32, sse4, 16i4),
+ SIMD(SSE4 packed u32, sse4, 16u4),
+ SIMD(SSE4 packed s64, sse4, 16i8),
+ SIMD(SSE4 packed u64, sse4, 16u8),
SIMD(SSE/AVX scalar single, sse_avx, f4),
SIMD(SSE/AVX packed single, sse_avx, 16f4),
SIMD(SSE2/AVX scalar single, sse2_avx, f4),
@@ -113,6 +131,18 @@ static const struct {
SIMD(SSE2/AVX packed u32, sse2_avx, 16u4),
SIMD(SSE2/AVX packed s64, sse2_avx, 16i8),
SIMD(SSE2/AVX packed u64, sse2_avx, 16u8),
+ SIMD(SSE4/AVX scalar single, sse4_avx, f4),
+ SIMD(SSE4/AVX packed single, sse4_avx, 16f4),
+ SIMD(SSE4/AVX scalar double, sse4_avx, f8),
+ SIMD(SSE4/AVX packed double, sse4_avx, 16f8),
+ SIMD(SSE4/AVX packed s8, sse4_avx, 16i1),
+ SIMD(SSE4/AVX packed u8, sse4_avx, 16u1),
+ SIMD(SSE4/AVX packed s16, sse4_avx, 16i2),
+ SIMD(SSE4/AVX packed u16, sse4_avx, 16u2),
+ SIMD(SSE4/AVX packed s32, sse4_avx, 16i4),
+ SIMD(SSE4/AVX packed u32, sse4_avx, 16u4),
+ SIMD(SSE4/AVX packed s64, sse4_avx, 16i8),
+ SIMD(SSE4/AVX packed u64, sse4_avx, 16u8),
#undef SIMD_
#undef SIMD
};
@@ -2682,6 +2712,99 @@ int main(int argc, char **argv)
goto fail;
printf("okay\n");
}
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing extrq $4,$56,%xmm2...");
+ if ( stack_exec && cpu_has_sse4a )
+ {
+ decl_insn(extrq_imm);
+
+ res[0] = 0x44332211;
+ res[1] = 0x88776655;
+ asm volatile ( "movq %0, %%xmm2\n"
+ put_insn(extrq_imm, "extrq $4, $56, %%xmm2")
+ :: "m" (res[0]) : "memory" );
+
+ set_insn(extrq_imm);
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "movq %%xmm2, %0" : "=m" (res[4]) :: "memory" );
+ if ( rc != X86EMUL_OKAY || !check_eip(extrq_imm) ||
+ res[4] != 0x54433221 || res[5] != 0x877665 )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing extrq %xmm3,%xmm2...");
+ if ( stack_exec && cpu_has_sse4a )
+ {
+ decl_insn(extrq_reg);
+
+ res[4] = 56 + (4 << 8);
+ res[5] = 0;
+ asm volatile ( "movq %0, %%xmm2\n"
+ "movq %1, %%xmm3\n"
+ put_insn(extrq_reg, "extrq %%xmm3, %%xmm2")
+ :: "m" (res[0]), "m" (res[4]) : "memory" );
+
+ set_insn(extrq_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "movq %%xmm2, %0" : "=m" (res[4]) :: "memory" );
+ if ( rc != X86EMUL_OKAY || !check_eip(extrq_reg) ||
+ res[4] != 0x54433221 || res[5] != 0x877665 )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing insertq $12,$40,%xmm2,%xmm3...");
+ if ( stack_exec && cpu_has_sse4a )
+ {
+ decl_insn(insertq_imm);
+
+ res[4] = 0xccbbaa99;
+ res[5] = 0x00ffeedd;
+ asm volatile ( "movq %1, %%xmm2\n"
+ "movq %0, %%xmm3\n"
+ put_insn(insertq_imm, "insertq $12, $40, %%xmm2, %%xmm3")
+ :: "m" (res[0]), "m" (res[4]) : "memory" );
+
+ set_insn(insertq_imm);
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "movq %%xmm3, %0" : "=m" (res[4]) :: "memory" );
+ if ( rc != X86EMUL_OKAY || !check_eip(insertq_imm) ||
+ res[4] != 0xbaa99211 || res[5] != 0x887ddccb )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing insertq %xmm2,%xmm3...");
+ if ( stack_exec && cpu_has_sse4a )
+ {
+ decl_insn(insertq_reg);
+
+ res[4] = 0xccbbaa99;
+ res[5] = 0x00ffeedd;
+ res[6] = 40 + (12 << 8);
+ res[7] = 0;
+ asm volatile ( "movdqu %1, %%xmm2\n"
+ "movq %0, %%xmm3\n"
+ put_insn(insertq_reg, "insertq %%xmm2, %%xmm3")
+ :: "m" (res[0]), "m" (res[4]) : "memory" );
+
+ set_insn(insertq_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ asm ( "movq %%xmm3, %0" : "=m" (res[4]) :: "memory" );
+ if ( rc != X86EMUL_OKAY || !check_eip(insertq_reg) ||
+ res[4] != 0xbaa99211 || res[5] != 0x887ddccb )
+ goto fail;
+ printf("okay\n");
+ }
else
printf("skipped\n");

--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -150,6 +150,12 @@ static inline uint64_t xgetbv(uint32_t x
(res.b & (1U << 8)) != 0; \
})

+#define cpu_has_sse4a ({ \
+ struct cpuid_leaf res; \
+ emul_test_cpuid(0x80000001, 0, &res, NULL); \
+ (res.c & (1U << 6)) != 0; \
+})
+
#define cpu_has_tbm ({ \
struct cpuid_leaf res; \
emul_test_cpuid(0x80000001, 0, &res, NULL); \
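[Editorial note, not part of the patch: the expected constants checked above
(res[4] == 0x54433221 / res[5] == 0x877665 for extrq, res[4] == 0xbaa99211 /
res[5] == 0x887ddccb for insertq) follow from the SSE4a bit-field semantics:
extrq pulls a length-bit field starting at bit "index" out of the low quadword,
and insertq writes such a field into the destination's low quadword; note that
in AT&T syntax the index immediate comes before the length immediate. A minimal
standalone C model that reproduces those values (helper names made up here, and
the "surrounding bits preserved" behaviour of insertq is inferred from the
test's expected values) could look like:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Field extract from the low 64 bits of the source. */
static uint64_t extrq_model(uint64_t src, unsigned int length, unsigned int index)
{
    uint64_t mask;

    length &= 0x3f;
    index &= 0x3f;
    mask = length ? (1ULL << length) - 1 : ~0ULL; /* length 0 selects 64 bits */

    return (src >> index) & mask;
}

/* Field insert into the low 64 bits of the destination, other bits preserved. */
static uint64_t insertq_model(uint64_t dst, uint64_t field,
                              unsigned int length, unsigned int index)
{
    uint64_t mask;

    length &= 0x3f;
    index &= 0x3f;
    mask = length ? (1ULL << length) - 1 : ~0ULL;

    return (dst & ~(mask << index)) | ((field & mask) << index);
}

int main(void)
{
    /* extrq $4, $56, %xmm2: 56-bit field starting at bit 4. */
    printf("%#018" PRIx64 "\n",
           extrq_model(0x8877665544332211ULL, 56, 4));
    /* prints 0x0087766554433221, i.e. res[4] = 0x54433221, res[5] = 0x877665 */

    /* insertq $12, $40, %xmm2, %xmm3: 40-bit field inserted at bit 12. */
    printf("%#018" PRIx64 "\n",
           insertq_model(0x8877665544332211ULL, 0x00ffeeddccbbaa99ULL, 40, 12));
    /* prints 0x887ddccbbaa99211, i.e. res[4] = 0xbaa99211, res[5] = 0x887ddccb */

    return 0;
}

Both outputs match what the tests store back into res[4]/res[5].]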
Andrew Cooper
2017-03-01 17:22:36 UTC
Permalink
Acked-by: Andrew Cooper <***@citrix.com>
Jan Beulich
2017-02-28 12:58:43 UTC
Permalink
... and their AVX equivalents.

Signed-off-by: Jan Beulich <***@suse.com>
---
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -368,6 +368,8 @@ static const struct {
[0x37 ... 0x3f] = { .simd_size = simd_packed_int },
[0x40] = { .simd_size = simd_packed_int },
[0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
[0xf0] = { .two_op = 1 },
[0xf1] = { .to_memory = 1, .two_op = 1 },
[0xf2 ... 0xf3] = {},
@@ -397,6 +399,7 @@ static const struct {
[0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
[0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xf0] = {},
};

@@ -1465,6 +1468,7 @@ static bool vcpu_has(
#define vcpu_has_sse4_2() vcpu_has( 1, ECX, 20, ctxt, ops)
#define vcpu_has_movbe() vcpu_has( 1, ECX, 22, ctxt, ops)
#define vcpu_has_popcnt() vcpu_has( 1, ECX, 23, ctxt, ops)
+#define vcpu_has_aesni() vcpu_has( 1, ECX, 25, ctxt, ops)
#define vcpu_has_avx() vcpu_has( 1, ECX, 28, ctxt, ops)
#define vcpu_has_rdrand() vcpu_has( 1, ECX, 30, ctxt, ops)
#define vcpu_has_mmxext() (vcpu_has(0x80000001, EDX, 22, ctxt, ops) || \
@@ -7155,6 +7159,22 @@ x86_emulate(
host_and_vcpu_must_have(sse4_2);
goto simd_0f38_common;

+ case X86EMUL_OPC_66(0x0f38, 0xdb): /* aesimc xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xdb): /* vaesimc xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0xdc): /* aesenc xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xdc): /* vaesenc xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0xdd): /* aesenclast xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xdd): /* vaesenclast xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0xde): /* aesdec xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xde): /* vaesdec xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_66(0x0f38, 0xdf): /* aesdeclast xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xdf): /* vaesdeclast xmm/m128,xmm,xmm */
+ host_and_vcpu_must_have(aesni);
+ if ( vex.opcx == vex_none )
+ goto simd_0f38_common;
+ generate_exception_if(vex.l, EXC_UD);
+ goto simd_0f_avx;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);
@@ -7510,6 +7530,14 @@ x86_emulate(
dst.type = OP_NONE;
break;

+ case X86EMUL_OPC_66(0x0f3a, 0xdf): /* aeskeygenassist $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0xdf): /* vaeskeygenassist $imm8,xmm/m128,xmm */
+ host_and_vcpu_must_have(aesni);
+ if ( vex.opcx == vex_none )
+ goto simd_0f3a_common;
+ generate_exception_if(vex.l, EXC_UD);
+ goto simd_0f_imm8_avx;
+
case X86EMUL_OPC_VEX_F2(0x0f3a, 0xf0): /* rorx imm,r/m,r */
vcpu_must_have(bmi2);
generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -44,6 +44,7 @@
#define cpu_has_sse4_2 boot_cpu_has(X86_FEATURE_SSE4_2)
#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
#define cpu_has_popcnt boot_cpu_has(X86_FEATURE_POPCNT)
+#define cpu_has_aesni boot_cpu_has(X86_FEATURE_AESNI)
#define cpu_has_htt boot_cpu_has(X86_FEATURE_HTT)
#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH)
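[Editorial note: as the cover letter says, test coverage for this patch is
still missing. If a test gets added, the harness would also want an AESNI
feature check next to the existing ones in
tools/tests/x86_emulator/x86_emulate.h. A minimal sketch (not part of the
patch), modeled on the cpu_has_sse4a macro added earlier and using CPUID
leaf 1 ECX bit 25 to match vcpu_has_aesni() above:

#define cpu_has_aesni ({ \
    struct cpuid_leaf res; \
    emul_test_cpuid(1, 0, &res, NULL); \
    (res.c & (1U << 25)) != 0; \
})

A test of the VEX-encoded forms would additionally be gated on cpu_has_avx,
in line with the existing *_avx cases.]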
Jan Beulich
2017-02-28 12:59:17 UTC
Permalink
Signed-off-by: Jan Beulich <***@suse.com>
---
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -368,6 +368,7 @@ static const struct {
[0x37 ... 0x3f] = { .simd_size = simd_packed_int },
[0x40] = { .simd_size = simd_packed_int },
[0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xc8 ... 0xcd] = { .simd_size = simd_other },
[0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xdc ... 0xdf] = { .simd_size = simd_packed_int },
[0xf0] = { .two_op = 1 },
@@ -399,6 +400,7 @@ static const struct {
[0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
[0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0xcc] = { .simd_size = simd_other },
[0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xf0] = {},
};
@@ -1490,6 +1492,7 @@ static bool vcpu_has(
#define vcpu_has_smap() vcpu_has( 7, EBX, 20, ctxt, ops)
#define vcpu_has_clflushopt() vcpu_has( 7, EBX, 23, ctxt, ops)
#define vcpu_has_clwb() vcpu_has( 7, EBX, 24, ctxt, ops)
+#define vcpu_has_sha() vcpu_has( 7, EBX, 29, ctxt, ops)
#define vcpu_has_rdpid() vcpu_has( 7, ECX, 22, ctxt, ops)

#define vcpu_must_have(feat) \
@@ -7159,6 +7162,16 @@ x86_emulate(
host_and_vcpu_must_have(sse4_2);
goto simd_0f38_common;

+ case X86EMUL_OPC(0x0f38, 0xc8): /* sha1nexte xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0xc9): /* sha1msg1 xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0xca): /* sha1msg2 xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0xcb): /* sha256rnds2 XMM0,xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0xcc): /* sha256msg1 xmm/m128,xmm */
+ case X86EMUL_OPC(0x0f38, 0xcd): /* sha256msg2 xmm/m128,xmm */
+ host_and_vcpu_must_have(sha);
+ op_bytes = 16;
+ goto simd_0f38_common;
+
case X86EMUL_OPC_66(0x0f38, 0xdb): /* aesimc xmm/m128,xmm */
case X86EMUL_OPC_VEX_66(0x0f38, 0xdb): /* vaesimc xmm/m128,xmm */
case X86EMUL_OPC_66(0x0f38, 0xdc): /* aesenc xmm/m128,xmm,xmm */
@@ -7530,6 +7543,11 @@ x86_emulate(
dst.type = OP_NONE;
break;

+ case X86EMUL_OPC(0x0f3a, 0xcc): /* sha1rnds4 $imm8,xmm/m128,xmm */
+ host_and_vcpu_must_have(sha);
+ op_bytes = 16;
+ goto simd_0f3a_common;
+
case X86EMUL_OPC_66(0x0f3a, 0xdf): /* aeskeygenassist $imm8,xmm/m128,xmm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0xdf): /* vaeskeygenassist $imm8,xmm/m128,xmm */
host_and_vcpu_must_have(aesni);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -85,6 +85,7 @@
#define cpu_has_sse4a boot_cpu_has(X86_FEATURE_SSE4A)
#define cpu_has_tbm boot_cpu_has(X86_FEATURE_TBM)
#define cpu_has_itsc boot_cpu_has(X86_FEATURE_ITSC)
+#define cpu_has_sha boot_cpu_has(X86_FEATURE_SHA)

enum _cache_type {
CACHE_TYPE_NULL = 0,
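[Editorial note: test coverage is likewise still missing here. A corresponding
harness-side feature check (again only a sketch, not part of the patch), using
CPUID leaf 7 subleaf 0 EBX bit 29 to match vcpu_has_sha() above, might be:

#define cpu_has_sha ({ \
    struct cpuid_leaf res; \
    emul_test_cpuid(7, 0, &res, NULL); \
    (res.b & (1U << 29)) != 0; \
})

The sha1*/sha256* tests themselves would then be gated on
stack_exec && cpu_has_sha, mirroring the SSE4a tests earlier in the series.]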
Andrew Cooper
2017-03-01 17:51:49 UTC
Permalink
Reviewed-by: Andrew Cooper <***@citrix.com>