qemu

Форк
0
/
tcg-target.c.inc 
4407 строк · 138.4 Кб
1
/*
2
 * Tiny Code Generator for QEMU
3
 *
4
 * Copyright (c) 2008 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24

25
#include "../tcg-ldst.c.inc"
26
#include "../tcg-pool.c.inc"
27

28
#ifdef CONFIG_DEBUG_TCG
29
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30
#if TCG_TARGET_REG_BITS == 64
31
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32
#else
33
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34
#endif
35
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37
#if TCG_TARGET_REG_BITS == 64
38
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40
#endif
41
};
42
#endif
43

44
static const int tcg_target_reg_alloc_order[] = {
45
#if TCG_TARGET_REG_BITS == 64
46
    TCG_REG_RBP,
47
    TCG_REG_RBX,
48
    TCG_REG_R12,
49
    TCG_REG_R13,
50
    TCG_REG_R14,
51
    TCG_REG_R15,
52
    TCG_REG_R10,
53
    TCG_REG_R11,
54
    TCG_REG_R9,
55
    TCG_REG_R8,
56
    TCG_REG_RCX,
57
    TCG_REG_RDX,
58
    TCG_REG_RSI,
59
    TCG_REG_RDI,
60
    TCG_REG_RAX,
61
#else
62
    TCG_REG_EBX,
63
    TCG_REG_ESI,
64
    TCG_REG_EDI,
65
    TCG_REG_EBP,
66
    TCG_REG_ECX,
67
    TCG_REG_EDX,
68
    TCG_REG_EAX,
69
#endif
70
    TCG_REG_XMM0,
71
    TCG_REG_XMM1,
72
    TCG_REG_XMM2,
73
    TCG_REG_XMM3,
74
    TCG_REG_XMM4,
75
    TCG_REG_XMM5,
76
#ifndef _WIN64
77
    /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
78
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
79
    TCG_REG_XMM6,
80
    TCG_REG_XMM7,
81
#if TCG_TARGET_REG_BITS == 64
82
    TCG_REG_XMM8,
83
    TCG_REG_XMM9,
84
    TCG_REG_XMM10,
85
    TCG_REG_XMM11,
86
    TCG_REG_XMM12,
87
    TCG_REG_XMM13,
88
    TCG_REG_XMM14,
89
    TCG_REG_XMM15,
90
#endif
91
#endif
92
};
93

94
#define TCG_TMP_VEC  TCG_REG_XMM5
95

96
static const int tcg_target_call_iarg_regs[] = {
97
#if TCG_TARGET_REG_BITS == 64
98
#if defined(_WIN64)
99
    TCG_REG_RCX,
100
    TCG_REG_RDX,
101
#else
102
    TCG_REG_RDI,
103
    TCG_REG_RSI,
104
    TCG_REG_RDX,
105
    TCG_REG_RCX,
106
#endif
107
    TCG_REG_R8,
108
    TCG_REG_R9,
109
#else
110
    /* 32 bit mode uses stack based calling convention (GCC default). */
111
#endif
112
};
113

114
static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115
{
116
    switch (kind) {
117
    case TCG_CALL_RET_NORMAL:
118
        tcg_debug_assert(slot >= 0 && slot <= 1);
119
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
120
#ifdef _WIN64
121
    case TCG_CALL_RET_BY_VEC:
122
        tcg_debug_assert(slot == 0);
123
        return TCG_REG_XMM0;
124
#endif
125
    default:
126
        g_assert_not_reached();
127
    }
128
}
129

130
/* Constants we accept.  */
131
#define TCG_CT_CONST_S32 0x100
132
#define TCG_CT_CONST_U32 0x200
133
#define TCG_CT_CONST_I32 0x400
134
#define TCG_CT_CONST_WSZ 0x800
135
#define TCG_CT_CONST_TST 0x1000
136

137
/* Registers used with L constraint, which are the first argument
138
   registers on x86_64, and two random call clobbered registers on
139
   i386. */
140
#if TCG_TARGET_REG_BITS == 64
141
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
142
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
143
#else
144
# define TCG_REG_L0 TCG_REG_EAX
145
# define TCG_REG_L1 TCG_REG_EDX
146
#endif
147

148
#if TCG_TARGET_REG_BITS == 64
149
# define ALL_GENERAL_REGS      0x0000ffffu
150
# define ALL_VECTOR_REGS       0xffff0000u
151
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
152
#else
153
# define ALL_GENERAL_REGS      0x000000ffu
154
# define ALL_VECTOR_REGS       0x00ff0000u
155
# define ALL_BYTEL_REGS        0x0000000fu
156
#endif
157
#define SOFTMMU_RESERVE_REGS \
158
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
159

160
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
161
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
162

163
static const tcg_insn_unit *tb_ret_addr;
164

165
static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
166
                        intptr_t value, intptr_t addend)
167
{
168
    value += addend;
169
    switch(type) {
170
    case R_386_PC32:
171
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
172
        if (value != (int32_t)value) {
173
            return false;
174
        }
175
        /* FALLTHRU */
176
    case R_386_32:
177
        tcg_patch32(code_ptr, value);
178
        break;
179
    case R_386_PC8:
180
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
181
        if (value != (int8_t)value) {
182
            return false;
183
        }
184
        tcg_patch8(code_ptr, value);
185
        break;
186
    default:
187
        g_assert_not_reached();
188
    }
189
    return true;
190
}
191

192
/* test if a constant matches the constraint */
193
static bool tcg_target_const_match(int64_t val, int ct,
194
                                   TCGType type, TCGCond cond, int vece)
195
{
196
    if (ct & TCG_CT_CONST) {
197
        return 1;
198
    }
199
    if (type == TCG_TYPE_I32) {
200
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
201
                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
202
            return 1;
203
        }
204
    } else {
205
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
206
            return 1;
207
        }
208
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
209
            return 1;
210
        }
211
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
212
            return 1;
213
        }
214
        /*
215
         * This will be used in combination with TCG_CT_CONST_S32,
216
         * so "normal" TESTQ is already matched.  Also accept:
217
         *    TESTQ -> TESTL   (uint32_t)
218
         *    TESTQ -> BT      (is_power_of_2)
219
         */
220
        if ((ct & TCG_CT_CONST_TST)
221
            && is_tst_cond(cond)
222
            && (val == (uint32_t)val || is_power_of_2(val))) {
223
            return 1;
224
        }
225
    }
226
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
227
        return 1;
228
    }
229
    return 0;
230
}
231

232
# define LOWREGMASK(x)	((x) & 7)
233

234
#define P_EXT		0x100		/* 0x0f opcode prefix */
235
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
236
#define P_DATA16        0x400           /* 0x66 opcode prefix */
237
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
238
#if TCG_TARGET_REG_BITS == 64
239
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
240
# define P_REXB_R       0x2000          /* REG field as byte register */
241
# define P_REXB_RM      0x4000          /* R/M field as byte register */
242
# define P_GS           0x8000          /* gs segment override */
243
#else
244
# define P_REXW		0
245
# define P_REXB_R	0
246
# define P_REXB_RM	0
247
# define P_GS           0
248
#endif
249
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
250
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
251
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
252
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
253
#define P_EVEX          0x100000        /* Requires EVEX encoding */
254

255
#define OPC_ARITH_EbIb	(0x80)
256
#define OPC_ARITH_EvIz	(0x81)
257
#define OPC_ARITH_EvIb	(0x83)
258
#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
259
#define OPC_ANDN        (0xf2 | P_EXT38)
260
#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
261
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
262
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
263
#define OPC_BSF         (0xbc | P_EXT)
264
#define OPC_BSR         (0xbd | P_EXT)
265
#define OPC_BSWAP	(0xc8 | P_EXT)
266
#define OPC_CALL_Jz	(0xe8)
267
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
268
#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
269
#define OPC_DEC_r32	(0x48)
270
#define OPC_IMUL_GvEv	(0xaf | P_EXT)
271
#define OPC_IMUL_GvEvIb	(0x6b)
272
#define OPC_IMUL_GvEvIz	(0x69)
273
#define OPC_INC_r32	(0x40)
274
#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
275
#define OPC_JCC_short	(0x70)		/* ... plus condition code */
276
#define OPC_JMP_long	(0xe9)
277
#define OPC_JMP_short	(0xeb)
278
#define OPC_LEA         (0x8d)
279
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
280
#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
281
#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
282
#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
283
#define OPC_MOVB_EvIz   (0xc6)
284
#define OPC_MOVL_EvIz	(0xc7)
285
#define OPC_MOVB_Ib     (0xb0)
286
#define OPC_MOVL_Iv     (0xb8)
287
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
288
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
289
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
290
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
291
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
292
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
293
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
294
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
295
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
296
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
297
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
298
#define OPC_MOVSBL	(0xbe | P_EXT)
299
#define OPC_MOVSWL	(0xbf | P_EXT)
300
#define OPC_MOVSLQ	(0x63 | P_REXW)
301
#define OPC_MOVZBL	(0xb6 | P_EXT)
302
#define OPC_MOVZWL	(0xb7 | P_EXT)
303
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
304
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
305
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
306
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
307
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
308
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
309
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
310
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
311
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
312
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
313
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
314
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
315
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
316
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
317
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
318
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
319
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
320
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
321
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
322
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
323
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
324
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
325
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
326
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
327
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
328
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
329
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
330
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
331
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
332
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
333
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
334
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
335
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
336
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
337
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
338
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
339
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
340
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
341
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
342
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
343
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
344
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
345
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
346
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
347
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
348
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
349
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
350
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
351
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
352
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
353
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
354
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
355
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
356
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
357
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
358
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
359
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
360
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
361
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
362
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
363
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
364
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
365
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
366
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
367
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
368
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
369
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
370
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
371
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
372
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
373
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
374
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
375
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
376
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
377
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
378
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
379
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
380
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
381
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
382
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
383
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
384
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
385
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
386
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
387
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
388
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
389
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
390
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
391
#define OPC_POP_r32	(0x58)
392
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
393
#define OPC_PUSH_r32	(0x50)
394
#define OPC_PUSH_Iv	(0x68)
395
#define OPC_PUSH_Ib	(0x6a)
396
#define OPC_RET		(0xc3)
397
#define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
398
#define OPC_SHIFT_1	(0xd1)
399
#define OPC_SHIFT_Ib	(0xc1)
400
#define OPC_SHIFT_cl	(0xd3)
401
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
402
#define OPC_SHUFPS      (0xc6 | P_EXT)
403
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
404
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
405
#define OPC_SHRD_Ib     (0xac | P_EXT)
406
#define OPC_TESTB	(0x84)
407
#define OPC_TESTL	(0x85)
408
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
409
#define OPC_UD2         (0x0b | P_EXT)
410
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
411
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
412
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
413
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
414
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
415
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
416
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
417
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
418
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
419
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
420
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
421
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
422
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
423
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
424
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
425
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
426
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
427
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
428
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
429
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
430
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
431
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
432
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
433
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
434
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
435
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
436
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
437
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
438
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
439
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
440
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
441
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
442
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
443
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
444
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
445
#define OPC_VZEROUPPER  (0x77 | P_EXT)
446
#define OPC_XCHG_ax_r32	(0x90)
447
#define OPC_XCHG_EvGv   (0x87)
448

449
#define OPC_GRP3_Eb     (0xf6)
450
#define OPC_GRP3_Ev     (0xf7)
451
#define OPC_GRP5        (0xff)
452
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
453
#define OPC_GRPBT       (0xba | P_EXT)
454

455
#define OPC_GRPBT_BT    4
456
#define OPC_GRPBT_BTS   5
457
#define OPC_GRPBT_BTR   6
458
#define OPC_GRPBT_BTC   7
459

460
/* Group 1 opcode extensions for 0x80-0x83.
461
   These are also used as modifiers for OPC_ARITH.  */
462
#define ARITH_ADD 0
463
#define ARITH_OR  1
464
#define ARITH_ADC 2
465
#define ARITH_SBB 3
466
#define ARITH_AND 4
467
#define ARITH_SUB 5
468
#define ARITH_XOR 6
469
#define ARITH_CMP 7
470

471
/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
472
#define SHIFT_ROL 0
473
#define SHIFT_ROR 1
474
#define SHIFT_SHL 4
475
#define SHIFT_SHR 5
476
#define SHIFT_SAR 7
477

478
/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
479
#define EXT3_TESTi 0
480
#define EXT3_NOT   2
481
#define EXT3_NEG   3
482
#define EXT3_MUL   4
483
#define EXT3_IMUL  5
484
#define EXT3_DIV   6
485
#define EXT3_IDIV  7
486

487
/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
488
#define EXT5_INC_Ev	0
489
#define EXT5_DEC_Ev	1
490
#define EXT5_CALLN_Ev	2
491
#define EXT5_JMPN_Ev	4
492

493
/* Condition codes to be added to OPC_JCC_{long,short}.  */
494
#define JCC_JMP (-1)
495
#define JCC_JO  0x0
496
#define JCC_JNO 0x1
497
#define JCC_JB  0x2
498
#define JCC_JAE 0x3
499
#define JCC_JE  0x4
500
#define JCC_JNE 0x5
501
#define JCC_JBE 0x6
502
#define JCC_JA  0x7
503
#define JCC_JS  0x8
504
#define JCC_JNS 0x9
505
#define JCC_JP  0xa
506
#define JCC_JNP 0xb
507
#define JCC_JL  0xc
508
#define JCC_JGE 0xd
509
#define JCC_JLE 0xe
510
#define JCC_JG  0xf
511

512
static const uint8_t tcg_cond_to_jcc[] = {
513
    [TCG_COND_EQ] = JCC_JE,
514
    [TCG_COND_NE] = JCC_JNE,
515
    [TCG_COND_LT] = JCC_JL,
516
    [TCG_COND_GE] = JCC_JGE,
517
    [TCG_COND_LE] = JCC_JLE,
518
    [TCG_COND_GT] = JCC_JG,
519
    [TCG_COND_LTU] = JCC_JB,
520
    [TCG_COND_GEU] = JCC_JAE,
521
    [TCG_COND_LEU] = JCC_JBE,
522
    [TCG_COND_GTU] = JCC_JA,
523
    [TCG_COND_TSTEQ] = JCC_JE,
524
    [TCG_COND_TSTNE] = JCC_JNE,
525
};
526

527
#if TCG_TARGET_REG_BITS == 64
528
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
529
{
530
    int rex;
531

532
    if (opc & P_GS) {
533
        tcg_out8(s, 0x65);
534
    }
535
    if (opc & P_DATA16) {
536
        /* We should never be asking for both 16 and 64-bit operation.  */
537
        tcg_debug_assert((opc & P_REXW) == 0);
538
        tcg_out8(s, 0x66);
539
    }
540
    if (opc & P_SIMDF3) {
541
        tcg_out8(s, 0xf3);
542
    } else if (opc & P_SIMDF2) {
543
        tcg_out8(s, 0xf2);
544
    }
545

546
    rex = 0;
547
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
548
    rex |= (r & 8) >> 1;                /* REX.R */
549
    rex |= (x & 8) >> 2;                /* REX.X */
550
    rex |= (rm & 8) >> 3;               /* REX.B */
551

552
    /* P_REXB_{R,RM} indicates that the given register is the low byte.
553
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
554
       as otherwise the encoding indicates %[abcd]h.  Note that the values
555
       that are ORed in merely indicate that the REX byte must be present;
556
       those bits get discarded in output.  */
557
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
558
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
559

560
    if (rex) {
561
        tcg_out8(s, (uint8_t)(rex | 0x40));
562
    }
563

564
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
565
        tcg_out8(s, 0x0f);
566
        if (opc & P_EXT38) {
567
            tcg_out8(s, 0x38);
568
        } else if (opc & P_EXT3A) {
569
            tcg_out8(s, 0x3a);
570
        }
571
    }
572

573
    tcg_out8(s, opc);
574
}
575
#else
576
static void tcg_out_opc(TCGContext *s, int opc)
577
{
578
    if (opc & P_DATA16) {
579
        tcg_out8(s, 0x66);
580
    }
581
    if (opc & P_SIMDF3) {
582
        tcg_out8(s, 0xf3);
583
    } else if (opc & P_SIMDF2) {
584
        tcg_out8(s, 0xf2);
585
    }
586
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
587
        tcg_out8(s, 0x0f);
588
        if (opc & P_EXT38) {
589
            tcg_out8(s, 0x38);
590
        } else if (opc & P_EXT3A) {
591
            tcg_out8(s, 0x3a);
592
        }
593
    }
594
    tcg_out8(s, opc);
595
}
596
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
597
   the 32-bit compilation paths.  This method works with all versions of gcc,
598
   whereas relying on optimization may not be able to exclude them.  */
599
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
600
#endif
601

602
static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
603
{
604
    tcg_out_opc(s, opc, r, rm, 0);
605
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
606
}
607

608
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
609
                            int rm, int index)
610
{
611
    int tmp;
612

613
    if (opc & P_GS) {
614
        tcg_out8(s, 0x65);
615
    }
616
    /* Use the two byte form if possible, which cannot encode
617
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
618
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
619
        && ((rm | index) & 8) == 0) {
620
        /* Two byte VEX prefix.  */
621
        tcg_out8(s, 0xc5);
622

623
        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
624
    } else {
625
        /* Three byte VEX prefix.  */
626
        tcg_out8(s, 0xc4);
627

628
        /* VEX.m-mmmm */
629
        if (opc & P_EXT3A) {
630
            tmp = 3;
631
        } else if (opc & P_EXT38) {
632
            tmp = 2;
633
        } else if (opc & P_EXT) {
634
            tmp = 1;
635
        } else {
636
            g_assert_not_reached();
637
        }
638
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
639
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
640
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
641
        tcg_out8(s, tmp);
642

643
        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
644
    }
645

646
    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
647
    /* VEX.pp */
648
    if (opc & P_DATA16) {
649
        tmp |= 1;                          /* 0x66 */
650
    } else if (opc & P_SIMDF3) {
651
        tmp |= 2;                          /* 0xf3 */
652
    } else if (opc & P_SIMDF2) {
653
        tmp |= 3;                          /* 0xf2 */
654
    }
655
    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
656
    tcg_out8(s, tmp);
657
    tcg_out8(s, opc);
658
}
659

660
static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
661
                             int rm, int index)
662
{
663
    /* The entire 4-byte evex prefix; with R' and V' set. */
664
    uint32_t p = 0x08041062;
665
    int mm, pp;
666

667
    tcg_debug_assert(have_avx512vl);
668

669
    /* EVEX.mm */
670
    if (opc & P_EXT3A) {
671
        mm = 3;
672
    } else if (opc & P_EXT38) {
673
        mm = 2;
674
    } else if (opc & P_EXT) {
675
        mm = 1;
676
    } else {
677
        g_assert_not_reached();
678
    }
679

680
    /* EVEX.pp */
681
    if (opc & P_DATA16) {
682
        pp = 1;                          /* 0x66 */
683
    } else if (opc & P_SIMDF3) {
684
        pp = 2;                          /* 0xf3 */
685
    } else if (opc & P_SIMDF2) {
686
        pp = 3;                          /* 0xf2 */
687
    } else {
688
        pp = 0;
689
    }
690

691
    p = deposit32(p, 8, 2, mm);
692
    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
693
    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
694
    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
695
    p = deposit32(p, 16, 2, pp);
696
    p = deposit32(p, 19, 4, ~v);
697
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
698
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
699

700
    tcg_out32(s, p);
701
    tcg_out8(s, opc);
702
}
703

704
static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
705
{
706
    if (opc & P_EVEX) {
707
        tcg_out_evex_opc(s, opc, r, v, rm, 0);
708
    } else {
709
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
710
    }
711
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
712
}
713

714
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
715
   We handle either RM and INDEX missing with a negative value.  In 64-bit
716
   mode for absolute addresses, ~RM is the size of the immediate operand
717
   that will follow the instruction.  */
718

719
static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
720
                               int shift, intptr_t offset)
721
{
722
    int mod, len;
723

724
    if (index < 0 && rm < 0) {
725
        if (TCG_TARGET_REG_BITS == 64) {
726
            /* Try for a rip-relative addressing mode.  This has replaced
727
               the 32-bit-mode absolute addressing encoding.  */
728
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
729
            intptr_t disp = offset - pc;
730
            if (disp == (int32_t)disp) {
731
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
732
                tcg_out32(s, disp);
733
                return;
734
            }
735

736
            /* Try for an absolute address encoding.  This requires the
737
               use of the MODRM+SIB encoding and is therefore larger than
738
               rip-relative addressing.  */
739
            if (offset == (int32_t)offset) {
740
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
741
                tcg_out8(s, (4 << 3) | 5);
742
                tcg_out32(s, offset);
743
                return;
744
            }
745

746
            /* ??? The memory isn't directly addressable.  */
747
            g_assert_not_reached();
748
        } else {
749
            /* Absolute address.  */
750
            tcg_out8(s, (r << 3) | 5);
751
            tcg_out32(s, offset);
752
            return;
753
        }
754
    }
755

756
    /* Find the length of the immediate addend.  Note that the encoding
757
       that would be used for (%ebp) indicates absolute addressing.  */
758
    if (rm < 0) {
759
        mod = 0, len = 4, rm = 5;
760
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
761
        mod = 0, len = 0;
762
    } else if (offset == (int8_t)offset) {
763
        mod = 0x40, len = 1;
764
    } else {
765
        mod = 0x80, len = 4;
766
    }
767

768
    /* Use a single byte MODRM format if possible.  Note that the encoding
769
       that would be used for %esp is the escape to the two byte form.  */
770
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
771
        /* Single byte MODRM format.  */
772
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
773
    } else {
774
        /* Two byte MODRM+SIB format.  */
775

776
        /* Note that the encoding that would place %esp into the index
777
           field indicates no index register.  In 64-bit mode, the REX.X
778
           bit counts, so %r12 can be used as the index.  */
779
        if (index < 0) {
780
            index = 4;
781
        } else {
782
            tcg_debug_assert(index != TCG_REG_ESP);
783
        }
784

785
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
786
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
787
    }
788

789
    if (len == 1) {
790
        tcg_out8(s, offset);
791
    } else if (len == 4) {
792
        tcg_out32(s, offset);
793
    }
794
}
795

796
static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
797
                                     int index, int shift, intptr_t offset)
798
{
799
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
800
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
801
}
802

803
static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
804
                                         int rm, int index, int shift,
805
                                         intptr_t offset)
806
{
807
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
808
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
809
}
810

811
/* A simplification of the above with no index or shift.  */
812
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
813
                                        int rm, intptr_t offset)
814
{
815
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
816
}
817

818
static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
819
                                            int v, int rm, intptr_t offset)
820
{
821
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
822
}
823

824
/* Output an opcode with an expected reference to the constant pool.  */
825
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
826
{
827
    tcg_out_opc(s, opc, r, 0, 0);
828
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
829
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
830
    tcg_out32(s, 0);
831
}
832

833
/* Output an opcode with an expected reference to the constant pool.  */
834
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
835
{
836
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
837
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
838
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
839
    tcg_out32(s, 0);
840
}
841

842
/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
843
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
844
{
845
    /* Propagate an opcode prefix, such as P_REXW.  */
846
    int ext = subop & ~0x7;
847
    subop &= 0x7;
848

849
    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
850
}
851

852
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
853
{
854
    int rexw = 0;
855

856
    if (arg == ret) {
857
        return true;
858
    }
859
    switch (type) {
860
    case TCG_TYPE_I64:
861
        rexw = P_REXW;
862
        /* fallthru */
863
    case TCG_TYPE_I32:
864
        if (ret < 16) {
865
            if (arg < 16) {
866
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
867
            } else {
868
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
869
            }
870
        } else {
871
            if (arg < 16) {
872
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
873
            } else {
874
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
875
            }
876
        }
877
        break;
878

879
    case TCG_TYPE_V64:
880
        tcg_debug_assert(ret >= 16 && arg >= 16);
881
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
882
        break;
883
    case TCG_TYPE_V128:
884
        tcg_debug_assert(ret >= 16 && arg >= 16);
885
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
886
        break;
887
    case TCG_TYPE_V256:
888
        tcg_debug_assert(ret >= 16 && arg >= 16);
889
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
890
        break;
891

892
    default:
893
        g_assert_not_reached();
894
    }
895
    return true;
896
}
897

898
static const int avx2_dup_insn[4] = {
899
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
900
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
901
};
902

903
static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
904
                            TCGReg r, TCGReg a)
905
{
906
    if (have_avx2) {
907
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
908
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
909
    } else {
910
        switch (vece) {
911
        case MO_8:
912
            /* ??? With zero in a register, use PSHUFB.  */
913
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
914
            a = r;
915
            /* FALLTHRU */
916
        case MO_16:
917
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
918
            a = r;
919
            /* FALLTHRU */
920
        case MO_32:
921
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
922
            /* imm8 operand: all output lanes selected from input lane 0.  */
923
            tcg_out8(s, 0);
924
            break;
925
        case MO_64:
926
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
927
            break;
928
        default:
929
            g_assert_not_reached();
930
        }
931
    }
932
    return true;
933
}
934

935
static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
936
                             TCGReg r, TCGReg base, intptr_t offset)
937
{
938
    if (have_avx2) {
939
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
940
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
941
                                 r, 0, base, offset);
942
    } else {
943
        switch (vece) {
944
        case MO_64:
945
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
946
            break;
947
        case MO_32:
948
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
949
            break;
950
        case MO_16:
951
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
952
            tcg_out8(s, 0); /* imm8 */
953
            tcg_out_dup_vec(s, type, vece, r, r);
954
            break;
955
        case MO_8:
956
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
957
            tcg_out8(s, 0); /* imm8 */
958
            tcg_out_dup_vec(s, type, vece, r, r);
959
            break;
960
        default:
961
            g_assert_not_reached();
962
        }
963
    }
964
    return true;
965
}
966

967
static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
968
                             TCGReg ret, int64_t arg)
969
{
970
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
971

972
    if (arg == 0) {
973
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
974
        return;
975
    }
976
    if (arg == -1) {
977
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
978
        return;
979
    }
980

981
    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
982
        if (have_avx2) {
983
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
984
        } else {
985
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
986
        }
987
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
988
    } else {
989
        if (type == TCG_TYPE_V64) {
990
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
991
        } else if (have_avx2) {
992
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
993
        } else {
994
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
995
        }
996
        if (TCG_TARGET_REG_BITS == 64) {
997
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
998
        } else {
999
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1000
        }
1001
    }
1002
}
1003

1004
static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1005
                             TCGReg ret, tcg_target_long arg)
1006
{
1007
    if (arg == 0) {
1008
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1009
        return;
1010
    }
1011
    if (arg == -1) {
1012
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1013
        return;
1014
    }
1015

1016
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1017
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1018
    if (TCG_TARGET_REG_BITS == 64) {
1019
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1020
    } else {
1021
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1022
    }
1023
}
1024

1025
static void tcg_out_movi_int(TCGContext *s, TCGType type,
1026
                             TCGReg ret, tcg_target_long arg)
1027
{
1028
    tcg_target_long diff;
1029

1030
    if (arg == 0) {
1031
        tgen_arithr(s, ARITH_XOR, ret, ret);
1032
        return;
1033
    }
1034
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1035
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1036
        tcg_out32(s, arg);
1037
        return;
1038
    }
1039
    if (arg == (int32_t)arg) {
1040
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1041
        tcg_out32(s, arg);
1042
        return;
1043
    }
1044

1045
    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1046
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1047
    if (diff == (int32_t)diff) {
1048
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1049
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1050
        tcg_out32(s, diff);
1051
        return;
1052
    }
1053

1054
    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1055
    tcg_out64(s, arg);
1056
}
1057

1058
static void tcg_out_movi(TCGContext *s, TCGType type,
1059
                         TCGReg ret, tcg_target_long arg)
1060
{
1061
    switch (type) {
1062
    case TCG_TYPE_I32:
1063
#if TCG_TARGET_REG_BITS == 64
1064
    case TCG_TYPE_I64:
1065
#endif
1066
        if (ret < 16) {
1067
            tcg_out_movi_int(s, type, ret, arg);
1068
        } else {
1069
            tcg_out_movi_vec(s, type, ret, arg);
1070
        }
1071
        break;
1072
    default:
1073
        g_assert_not_reached();
1074
    }
1075
}
1076

1077
static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1078
{
1079
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1080
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1081
    return true;
1082
}
1083

1084
static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1085
                             tcg_target_long imm)
1086
{
1087
    /* This function is only used for passing structs by reference. */
1088
    tcg_debug_assert(imm == (int32_t)imm);
1089
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1090
}
1091

1092
static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1093
{
1094
    if (val == (int8_t)val) {
1095
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1096
        tcg_out8(s, val);
1097
    } else if (val == (int32_t)val) {
1098
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1099
        tcg_out32(s, val);
1100
    } else {
1101
        g_assert_not_reached();
1102
    }
1103
}
1104

1105
static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1106
{
1107
    /* Given the strength of x86 memory ordering, we only need care for
1108
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1109
       faster than "mfence", so don't bother with the sse insn.  */
1110
    if (a0 & TCG_MO_ST_LD) {
1111
        tcg_out8(s, 0xf0);
1112
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1113
        tcg_out8(s, 0);
1114
    }
1115
}
1116

1117
static inline void tcg_out_push(TCGContext *s, int reg)
1118
{
1119
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1120
}
1121

1122
static inline void tcg_out_pop(TCGContext *s, int reg)
1123
{
1124
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1125
}
1126

1127
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1128
                       TCGReg arg1, intptr_t arg2)
1129
{
1130
    switch (type) {
1131
    case TCG_TYPE_I32:
1132
        if (ret < 16) {
1133
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1134
        } else {
1135
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1136
        }
1137
        break;
1138
    case TCG_TYPE_I64:
1139
        if (ret < 16) {
1140
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1141
            break;
1142
        }
1143
        /* FALLTHRU */
1144
    case TCG_TYPE_V64:
1145
        /* There is no instruction that can validate 8-byte alignment.  */
1146
        tcg_debug_assert(ret >= 16);
1147
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1148
        break;
1149
    case TCG_TYPE_V128:
1150
        /*
1151
         * The gvec infrastructure is asserts that v128 vector loads
1152
         * and stores use a 16-byte aligned offset.  Validate that the
1153
         * final pointer is aligned by using an insn that will SIGSEGV.
1154
         */
1155
        tcg_debug_assert(ret >= 16);
1156
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1157
        break;
1158
    case TCG_TYPE_V256:
1159
        /*
1160
         * The gvec infrastructure only requires 16-byte alignment,
1161
         * so here we must use an unaligned load.
1162
         */
1163
        tcg_debug_assert(ret >= 16);
1164
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1165
                                 ret, 0, arg1, arg2);
1166
        break;
1167
    default:
1168
        g_assert_not_reached();
1169
    }
1170
}
1171

1172
static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1173
                       TCGReg arg1, intptr_t arg2)
1174
{
1175
    switch (type) {
1176
    case TCG_TYPE_I32:
1177
        if (arg < 16) {
1178
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1179
        } else {
1180
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1181
        }
1182
        break;
1183
    case TCG_TYPE_I64:
1184
        if (arg < 16) {
1185
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1186
            break;
1187
        }
1188
        /* FALLTHRU */
1189
    case TCG_TYPE_V64:
1190
        /* There is no instruction that can validate 8-byte alignment.  */
1191
        tcg_debug_assert(arg >= 16);
1192
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1193
        break;
1194
    case TCG_TYPE_V128:
1195
        /*
1196
         * The gvec infrastructure is asserts that v128 vector loads
1197
         * and stores use a 16-byte aligned offset.  Validate that the
1198
         * final pointer is aligned by using an insn that will SIGSEGV.
1199
         *
1200
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1201
         * for _WIN64, which must have SSE2 but may not have AVX.
1202
         */
1203
        tcg_debug_assert(arg >= 16);
1204
        if (have_avx1) {
1205
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1206
        } else {
1207
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1208
        }
1209
        break;
1210
    case TCG_TYPE_V256:
1211
        /*
1212
         * The gvec infrastructure only requires 16-byte alignment,
1213
         * so here we must use an unaligned store.
1214
         */
1215
        tcg_debug_assert(arg >= 16);
1216
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1217
                                 arg, 0, arg1, arg2);
1218
        break;
1219
    default:
1220
        g_assert_not_reached();
1221
    }
1222
}
1223

1224
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1225
                        TCGReg base, intptr_t ofs)
1226
{
1227
    int rexw = 0;
1228
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1229
        if (val != (int32_t)val) {
1230
            return false;
1231
        }
1232
        rexw = P_REXW;
1233
    } else if (type != TCG_TYPE_I32) {
1234
        return false;
1235
    }
1236
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1237
    tcg_out32(s, val);
1238
    return true;
1239
}
1240

1241
static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1242
{
1243
    /* Propagate an opcode prefix, such as P_DATA16.  */
1244
    int ext = subopc & ~0x7;
1245
    subopc &= 0x7;
1246

1247
    if (count == 1) {
1248
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1249
    } else {
1250
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1251
        tcg_out8(s, count);
1252
    }
1253
}
1254

1255
static inline void tcg_out_bswap32(TCGContext *s, int reg)
1256
{
1257
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1258
}
1259

1260
static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1261
{
1262
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1263
}
1264

1265
static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1266
{
1267
    /* movzbl */
1268
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1269
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1270
}
1271

1272
static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1273
{
1274
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1275
    /* movsbl */
1276
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1277
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1278
}
1279

1280
static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1281
{
1282
    /* movzwl */
1283
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1284
}
1285

1286
static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1287
{
1288
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1289
    /* movsw[lq] */
1290
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1291
}
1292

1293
static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1294
{
1295
    /* 32-bit mov zero extends.  */
1296
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1297
}
1298

1299
static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1300
{
1301
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1302
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1303
}
1304

1305
static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1306
{
1307
    tcg_out_ext32s(s, dest, src);
1308
}
1309

1310
static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1311
{
1312
    if (dest != src) {
1313
        tcg_out_ext32u(s, dest, src);
1314
    }
1315
}
1316

1317
static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1318
{
1319
    tcg_out_ext32u(s, dest, src);
1320
}
1321

1322
static inline void tcg_out_bswap64(TCGContext *s, int reg)
1323
{
1324
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1325
}
1326

1327
static void tgen_arithi(TCGContext *s, int c, int r0,
1328
                        tcg_target_long val, int cf)
1329
{
1330
    int rexw = 0;
1331

1332
    if (TCG_TARGET_REG_BITS == 64) {
1333
        rexw = c & -8;
1334
        c &= 7;
1335
    }
1336

1337
    switch (c) {
1338
    case ARITH_ADD:
1339
    case ARITH_SUB:
1340
        if (!cf) {
1341
            /*
1342
             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1343
             * partial flags update stalls on Pentium4 and are not recommended
1344
             * by current Intel optimization manuals.
1345
             */
1346
            if (val == 1 || val == -1) {
1347
                int is_inc = (c == ARITH_ADD) ^ (val < 0);
1348
                if (TCG_TARGET_REG_BITS == 64) {
1349
                    /*
1350
                     * The single-byte increment encodings are re-tasked
1351
                     * as the REX prefixes.  Use the MODRM encoding.
1352
                     */
1353
                    tcg_out_modrm(s, OPC_GRP5 + rexw,
1354
                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1355
                } else {
1356
                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1357
                }
1358
                return;
1359
            }
1360
            if (val == 128) {
1361
                /*
1362
                 * Facilitate using an 8-bit immediate.  Carry is inverted
1363
                 * by this transformation, so do it only if cf == 0.
1364
                 */
1365
                c ^= ARITH_ADD ^ ARITH_SUB;
1366
                val = -128;
1367
            }
1368
        }
1369
        break;
1370

1371
    case ARITH_AND:
1372
        if (TCG_TARGET_REG_BITS == 64) {
1373
            if (val == 0xffffffffu) {
1374
                tcg_out_ext32u(s, r0, r0);
1375
                return;
1376
            }
1377
            if (val == (uint32_t)val) {
1378
                /* AND with no high bits set can use a 32-bit operation.  */
1379
                rexw = 0;
1380
            }
1381
        }
1382
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1383
            tcg_out_ext8u(s, r0, r0);
1384
            return;
1385
        }
1386
        if (val == 0xffffu) {
1387
            tcg_out_ext16u(s, r0, r0);
1388
            return;
1389
        }
1390
        break;
1391

1392
    case ARITH_OR:
1393
    case ARITH_XOR:
1394
        if (val >= 0x80 && val <= 0xff
1395
            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1396
            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
1397
            tcg_out8(s, val);
1398
            return;
1399
        }
1400
        break;
1401
    }
1402

1403
    if (val == (int8_t)val) {
1404
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1405
        tcg_out8(s, val);
1406
        return;
1407
    }
1408
    if (rexw == 0 || val == (int32_t)val) {
1409
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1410
        tcg_out32(s, val);
1411
        return;
1412
    }
1413

1414
    g_assert_not_reached();
1415
}
1416

1417
static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1418
{
1419
    if (val != 0) {
1420
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1421
    }
1422
}
1423

1424
/* Set SMALL to force a short forward branch.  */
1425
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1426
{
1427
    int32_t val, val1;
1428

1429
    if (l->has_value) {
1430
        val = tcg_pcrel_diff(s, l->u.value_ptr);
1431
        val1 = val - 2;
1432
        if ((int8_t)val1 == val1) {
1433
            if (opc == -1) {
1434
                tcg_out8(s, OPC_JMP_short);
1435
            } else {
1436
                tcg_out8(s, OPC_JCC_short + opc);
1437
            }
1438
            tcg_out8(s, val1);
1439
        } else {
1440
            tcg_debug_assert(!small);
1441
            if (opc == -1) {
1442
                tcg_out8(s, OPC_JMP_long);
1443
                tcg_out32(s, val - 5);
1444
            } else {
1445
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1446
                tcg_out32(s, val - 6);
1447
            }
1448
        }
1449
    } else if (small) {
1450
        if (opc == -1) {
1451
            tcg_out8(s, OPC_JMP_short);
1452
        } else {
1453
            tcg_out8(s, OPC_JCC_short + opc);
1454
        }
1455
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1456
        s->code_ptr += 1;
1457
    } else {
1458
        if (opc == -1) {
1459
            tcg_out8(s, OPC_JMP_long);
1460
        } else {
1461
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1462
        }
1463
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1464
        s->code_ptr += 4;
1465
    }
1466
}
1467

1468
static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
                       TCGArg arg2, int const_arg2, int rexw)
{
    int jz, js;

    if (!is_tst_cond(cond)) {
        if (!const_arg2) {
            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
        } else if (arg2 == 0) {
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
        return tcg_cond_to_jcc[cond];
    }

    jz = tcg_cond_to_jcc[cond];
    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);

    if (!const_arg2) {
        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
        return jz;
    }

    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
        if (arg2 == 0x80) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return js;
        }
        if (arg2 == 0xff) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
        tcg_out8(s, arg2);
        return jz;
    }

    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
        if (arg2 == 0x8000) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return js;
        }
        if (arg2 == 0xff00) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
        tcg_out8(s, arg2 >> 8);
        return jz;
    }

    if (arg2 == 0xffff) {
        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
        return jz;
    }
    if (arg2 == 0xffffffffu) {
        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
        return jz;
    }

    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
        int sh = ctz64(arg2);

        rexw = (sh & 32 ? P_REXW : 0);
        if ((sh & 31) == 31) {
            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
            return js;
        } else {
            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
            tcg_out8(s, sh);
            return jc;
        }
    }

    if (rexw) {
        if (arg2 == (uint32_t)arg2) {
            rexw = 0;
        } else {
            tcg_debug_assert(arg2 == (int32_t)arg2);
        }
    }
    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
    tcg_out32(s, arg2);
    return jz;
}

static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
                           TCGArg arg1, TCGArg arg2, int const_arg2,
                           TCGLabel *label, bool small)
{
    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
    tcg_out_jxx(s, jcc, label, small);
}

#if TCG_TARGET_REG_BITS == 32
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    TCGCond cond = args[4];

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_TSTEQ:
        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
                       args[0], args[2], const_args[2], label_next, 1);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;
    case TCG_COND_NE:
    case TCG_COND_TSTNE:
        tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    default:
        g_assert_not_reached();
    }
    tcg_out_label(s, label_next);
}
#endif

static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGArg dest, TCGArg arg1, TCGArg arg2,
                            int const_arg2, bool neg)
{
    int cmp_rexw = rexw;
    bool inv = false;
    bool cleared;
    int jcc;

    switch (cond) {
    case TCG_COND_NE:
        inv = true;
        /* fall through */
    case TCG_COND_EQ:
        /* If arg2 is 0, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0) {
            arg2 = 1;
            goto do_ltu;
        }
        break;

    case TCG_COND_TSTNE:
        inv = true;
        /* fall through */
    case TCG_COND_TSTEQ:
        /* If arg2 is -1, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0xffffffffu) {
            arg2 = 1;
            cmp_rexw = 0;
            goto do_ltu;
        }
        break;

    case TCG_COND_LEU:
        inv = true;
        /* fall through */
    case TCG_COND_GTU:
        /* If arg2 is a register, swap for LTU/GEU. */
        if (!const_arg2) {
            TCGReg t = arg1;
            arg1 = arg2;
            arg2 = t;
            goto do_ltu;
        }
        break;

    case TCG_COND_GEU:
        inv = true;
        /* fall through */
    case TCG_COND_LTU:
    do_ltu:
        /*
         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
         * We can then use NEG or INC to produce the desired result.
         * This is always smaller than the SETCC expansion.
         */
        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);

        /* X - X - C = -C = (C ? -1 : 0) */
        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
        if (inv && neg) {
            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
        } else if (inv) {
            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
        } else if (!neg) {
            /* -(C ? -1 : 0) = (C ? 1 : 0) */
            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
        }
        return;

    case TCG_COND_GE:
        inv = true;
        /* fall through */
    case TCG_COND_LT:
        /* If arg2 is 0, extract the sign bit. */
        if (const_arg2 && arg2 == 0) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
            if (inv) {
                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
            }
            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
                           dest, rexw ? 63 : 31);
            return;
        }
        break;

    default:
        break;
    }

    /*
     * If dest does not overlap the inputs, clearing it first is preferred.
     * The XOR breaks any false dependency for the low-byte write to dest,
     * and is also one byte smaller than MOVZBL.
     */
    cleared = false;
    if (dest != arg1 && (const_arg2 || dest != arg2)) {
        tgen_arithr(s, ARITH_XOR, dest, dest);
        cleared = true;
    }

    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);

    if (!cleared) {
        tcg_out_ext8u(s, dest, dest);
    }
    if (neg) {
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
    }
}

#if TCG_TARGET_REG_BITS == 32
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
                         TCGReg dest, TCGReg v1)
{
    tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
}

static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
                            TCGReg v1)
{
    int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
    tcg_out_cmov(s, jcc, rexw, dest, v1);
}

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
        tcg_out_cmov(s, jcc, rexw, dest, arg2);
    }
}

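/*
 * Emit a direct CALL (call != 0) or JMP (call == 0) to DEST, falling
 * back to an indirect branch through the constant pool when the
 * displacement does not fit in 32 bits.
 */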
static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}

static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

#ifndef _WIN32
    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
        /*
         * The sysv i386 abi for struct return places a reference as the
         * first argument on the stack, and pops that argument with the
         * return statement.  Since we want to retain the aligned stack
         * pointer for the callee, we do not want to actually push that
         * argument before the call but rely on the normal store to the
         * stack slot.  But we do need to compensate for the pop in order
         * to reset our correct stack pointer value.
         * Pushing a garbage value back onto the stack is quickest.
         */
        tcg_out_push(s, TCG_REG_EAX);
    }
#endif
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}

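/*
 * For example, tcg_out_nopn(s, 3) emits 66 66 90: two operand-size
 * prefixes in front of the single-byte nop.
 */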
static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

typedef struct {
    TCGReg base;        /* base register */
    int index;          /* index register, or -1 if none */
    int ofs;            /* constant displacement */
    int seg;            /* segment override prefix (e.g. P_GS), or 0 */
    TCGAtomAlign aa;    /* atomicity/alignment required for the access */
} HostAddress;

bool tcg_target_has_memory_bswap(MemOp memop)
{
    TCGAtomAlign aa;

    if (!have_movbe) {
        return false;
    }
    if ((memop & MO_SIZE) < MO_128) {
        return true;
    }

    /*
     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
     */
    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
    return aa.atom < MO_128;
}

/*
 * Because i686 has no register parameters and because x86_64 has xchg
 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
 *
 * Even then, a scratch is only needed for l->raddr.  Rather than expose
 * a general-purpose scratch when we don't actually know it's available,
 * use the ra_gen hook to load into RAX if needed.
 */
#if TCG_TARGET_REG_BITS == 64
static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
{
    if (arg < 0) {
        arg = TCG_REG_RAX;
    }
    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
    return arg;
}
static const TCGLdstHelperParam ldst_helper_param = {
    .ra_gen = ldst_ra_gen
};
#else
static const TCGLdstHelperParam ldst_helper_param = { };
#endif

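/*
 * Move the two 32-bit (or 64-bit) elements of vector register V into
 * the integer register pair L:H, and vice versa for pair_to_vec.
 */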
static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
                                TCGReg l, TCGReg h, TCGReg v)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    /* vpmov{d,q} %v, %l */
    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
    /* vpextr{d,q} $1, %v, %h */
    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
    tcg_out8(s, 1);
}

static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
                                TCGReg v, TCGReg l, TCGReg h)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    /* vmov{d,q} %l, %v */
    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
    /* vpinsr{d,q} $1, %h, %v, %v */
    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
    tcg_out8(s, 1);
}

/*
 * Generate code for the slow path for a load at the end of block
 */
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    MemOp opc = get_memop(l->oi);
    tcg_insn_unit **label_ptr = &l->label_ptr[0];

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (label_ptr[1]) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);

    tcg_out_jmp(s, l->raddr);
    return true;
}

/*
 * Generate code for the slow path for a store at the end of block
 */
static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    MemOp opc = get_memop(l->oi);
    tcg_insn_unit **label_ptr = &l->label_ptr[0];

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (label_ptr[1]) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    tcg_out_st_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);

    tcg_out_jmp(s, l->raddr);
    return true;
}

#ifdef CONFIG_USER_ONLY
static HostAddress x86_guest_base = {
    .index = -1
};

#if defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
static inline int setup_guest_base_seg(void)
{
    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#elif defined(__x86_64__) && \
      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
# include <machine/sysarch.h>
static inline int setup_guest_base_seg(void)
{
    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#endif
#else
# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
#endif /* CONFIG_USER_ONLY */
#ifndef setup_guest_base_seg
# define setup_guest_base_seg()  0
#endif

#define MIN_TLB_MASK_TABLE_OFS  INT_MIN

/*
 * For softmmu, perform the TLB load and compare.
 * For useronly, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addrlo, TCGReg addrhi,
                                           MemOpIdx oi, bool is_ld)
{
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    MemOp s_bits = opc & MO_SIZE;
    unsigned a_mask;

    if (tcg_use_softmmu) {
        h->index = TCG_REG_L0;
        h->ofs = 0;
        h->seg = 0;
    } else {
        *h = x86_guest_base;
    }
    h->base = addrlo;
    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
    a_mask = (1 << h->aa.align) - 1;

    if (tcg_use_softmmu) {
        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
                            : offsetof(CPUTLBEntry, addr_write);
        TCGType ttype = TCG_TYPE_I32;
        TCGType tlbtype = TCG_TYPE_I32;
        int trexw = 0, hrexw = 0, tlbrexw = 0;
        unsigned mem_index = get_mmuidx(oi);
        unsigned s_mask = (1 << s_bits) - 1;
        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
        int tlb_mask;

        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        if (TCG_TARGET_REG_BITS == 64) {
            ttype = s->addr_type;
            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
                hrexw = P_REXW;
                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
                    tlbtype = TCG_TYPE_I64;
                    tlbrexw = P_REXW;
                }
            }
        }

        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
                       s->page_bits - CPU_TLB_ENTRY_BITS);

        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, mask));

        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, table));

        /*
         * If the required alignment is at least as large as the access,
         * simply copy the address and mask.  For lesser alignments,
         * check that we don't cross pages for the complete access.
         */
        if (a_mask >= s_mask) {
            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
        } else {
            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
                                 addrlo, s_mask - a_mask);
        }
        tlb_mask = s->page_mask | a_mask;
        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);

        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;

        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
            /* cmp 4(TCG_REG_L0), addrhi */
            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
                                 TCG_REG_L0, cmp_ofs + 4);

            /* jne slow_path */
            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
            ldst->label_ptr[1] = s->code_ptr;
            s->code_ptr += 4;
        }

        /* TLB Hit.  */
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
                   offsetof(CPUTLBEntry, addend));
    } else if (a_mask) {
        int jcc;

        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        /* jne slow_path */
        jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false);
        tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;
    }

    return ldst;
}

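/*
 * Emit the fast-path load from the host address in @h into @datalo
 * (and @datahi for two-register results), using MOVBE for byte-swapped
 * accesses and a vector load when 16-byte atomicity is required.
 */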
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, TCGType type, MemOp memop)
{
    bool use_movbe = false;
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    int movop = OPC_MOVL_GvEv;

    /* Do big-endian loads with movbe.  */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_UW:
        if (use_movbe) {
            /* There is no extending movbe; only low 16-bits are modified.  */
            if (datalo != h.base && datalo != h.index) {
                /* XOR breaks dependency chains.  */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
            } else {
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
            tcg_out_ext16s(s, type, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
#endif
    case MO_UQ:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            break;
        }
        if (use_movbe) {
            TCGReg t = datalo;
            datalo = datahi;
            datahi = t;
        }
        if (h.base == datalo || h.index == datalo) {
            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
        } else {
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();
            int jcc;

            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
            tcg_out_jxx(s, jcc, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
        break;

    default:
        g_assert_not_reached();
    }
}

static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addrlo, TCGReg addrhi,
                            MemOpIdx oi, TCGType data_type)
{
    TCGLabelQemuLdst *ldst;
    HostAddress h;

    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));

    if (ldst) {
        ldst->type = data_type;
        ldst->datalo_reg = datalo;
        ldst->datahi_reg = datahi;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
    }
}

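/*
 * Emit the fast-path store of @datalo (and @datahi) to the host
 * address in @h, mirroring the load path above.
 */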
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or system-mode.
     * User-only without movbe will have its swapping done generically.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
                                 datalo, h.base, h.index, 0, h.ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        } else {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we have the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 8);
            break;
        }

        /*
         * With 16-byte atomicity, a vector store is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();
            int jcc;

            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
            tcg_out_jxx(s, jcc, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        break;

    default:
        g_assert_not_reached();
    }
}

static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addrlo, TCGReg addrhi,
                            MemOpIdx oi, TCGType data_type)
{
    TCGLabelQemuLdst *ldst;
    HostAddress h;

    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));

    if (ldst) {
        ldst->type = data_type;
        ldst->datalo_reg = datalo;
        ldst->datahi_reg = datahi;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
    }
}

static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
{
    /* Reuse the zeroing that exists for goto_ptr.  */
    if (a0 == 0) {
        tcg_out_jmp(s, tcg_code_gen_epilogue);
    } else {
        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
        tcg_out_jmp(s, tb_ret_addr);
    }
}

static void tcg_out_goto_tb(TCGContext *s, int which)
{
    /*
     * Jump displacement must be aligned for atomic patching;
     * see if we need to add extra nops before jump
     */
    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
    if (gap != 1) {
        tcg_out_nopn(s, gap - 1);
    }
    tcg_out8(s, OPC_JMP_long); /* jmp im */
    set_jmp_insn_offset(s, which);
    tcg_out32(s, 0);
    set_jmp_reset_offset(s, which);
}

void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
                              uintptr_t jmp_rx, uintptr_t jmp_rw)
{
    /* patch the branch destination */
    uintptr_t addr = tb->jmp_target_addr[n];
    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
    /* no need to flush icache explicitly */
}

2550
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2551
                              const TCGArg args[TCG_MAX_OP_ARGS],
2552
                              const int const_args[TCG_MAX_OP_ARGS])
2553
{
2554
    TCGArg a0, a1, a2;
2555
    int c, const_a2, vexop, rexw = 0;
2556

2557
#if TCG_TARGET_REG_BITS == 64
2558
# define OP_32_64(x) \
2559
        case glue(glue(INDEX_op_, x), _i64): \
2560
            rexw = P_REXW; /* FALLTHRU */    \
2561
        case glue(glue(INDEX_op_, x), _i32)
2562
#else
2563
# define OP_32_64(x) \
2564
        case glue(glue(INDEX_op_, x), _i32)
2565
#endif
2566

2567
    /* Hoist the loads of the most common arguments.  */
2568
    a0 = args[0];
2569
    a1 = args[1];
2570
    a2 = args[2];
2571
    const_a2 = const_args[2];
2572

2573
    switch (opc) {
2574
    case INDEX_op_goto_ptr:
2575
        /* jmp to the given host address (could be epilogue) */
2576
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2577
        break;
2578
    case INDEX_op_br:
2579
        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2580
        break;
2581
    OP_32_64(ld8u):
2582
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2583
        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2584
        break;
2585
    OP_32_64(ld8s):
2586
        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2587
        break;
2588
    OP_32_64(ld16u):
2589
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2590
        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2591
        break;
2592
    OP_32_64(ld16s):
2593
        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2594
        break;
2595
#if TCG_TARGET_REG_BITS == 64
2596
    case INDEX_op_ld32u_i64:
2597
#endif
2598
    case INDEX_op_ld_i32:
2599
        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2600
        break;
2601

2602
    OP_32_64(st8):
2603
        if (const_args[0]) {
2604
            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2605
            tcg_out8(s, a0);
2606
        } else {
2607
            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2608
        }
2609
        break;
2610
    OP_32_64(st16):
2611
        if (const_args[0]) {
2612
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2613
            tcg_out16(s, a0);
2614
        } else {
2615
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2616
        }
2617
        break;
2618
#if TCG_TARGET_REG_BITS == 64
2619
    case INDEX_op_st32_i64:
2620
#endif
2621
    case INDEX_op_st_i32:
2622
        if (const_args[0]) {
2623
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2624
            tcg_out32(s, a0);
2625
        } else {
2626
            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2627
        }
2628
        break;
2629

2630
    OP_32_64(add):
2631
        /* For 3-operand addition, use LEA.  */
2632
        if (a0 != a1) {
2633
            TCGArg c3 = 0;
2634
            if (const_a2) {
2635
                c3 = a2, a2 = -1;
2636
            } else if (a0 == a2) {
2637
                /* Watch out for dest = src + dest, since we've removed
2638
                   the matching constraint on the add.  */
2639
                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2640
                break;
2641
            }
2642

2643
            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2644
            break;
2645
        }
2646
        c = ARITH_ADD;
2647
        goto gen_arith;
2648
    OP_32_64(sub):
2649
        c = ARITH_SUB;
2650
        goto gen_arith;
2651
    OP_32_64(and):
2652
        c = ARITH_AND;
2653
        goto gen_arith;
2654
    OP_32_64(or):
2655
        c = ARITH_OR;
2656
        goto gen_arith;
2657
    OP_32_64(xor):
2658
        c = ARITH_XOR;
2659
        goto gen_arith;
2660
    gen_arith:
2661
        if (const_a2) {
2662
            tgen_arithi(s, c + rexw, a0, a2, 0);
2663
        } else {
2664
            tgen_arithr(s, c + rexw, a0, a2);
2665
        }
2666
        break;
2667

2668
    OP_32_64(andc):
2669
        if (const_a2) {
2670
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2671
            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2672
        } else {
2673
            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2674
        }
2675
        break;
2676

2677
    OP_32_64(mul):
2678
        if (const_a2) {
2679
            int32_t val;
2680
            val = a2;
2681
            if (val == (int8_t)val) {
2682
                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2683
                tcg_out8(s, val);
2684
            } else {
2685
                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2686
                tcg_out32(s, val);
2687
            }
2688
        } else {
2689
            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2690
        }
2691
        break;
2692

2693
    OP_32_64(div2):
2694
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2695
        break;
2696
    OP_32_64(divu2):
2697
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2698
        break;
2699

2700
    OP_32_64(shl):
2701
        /* For small constant 3-operand shift, use LEA.  */
2702
        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2703
            if (a2 - 1 == 0) {
2704
                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2705
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2706
            } else {
2707
                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2708
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2709
            }
2710
            break;
2711
        }
2712
        c = SHIFT_SHL;
2713
        vexop = OPC_SHLX;
2714
        goto gen_shift_maybe_vex;
2715
    OP_32_64(shr):
2716
        c = SHIFT_SHR;
2717
        vexop = OPC_SHRX;
2718
        goto gen_shift_maybe_vex;
2719
    OP_32_64(sar):
2720
        c = SHIFT_SAR;
2721
        vexop = OPC_SARX;
2722
        goto gen_shift_maybe_vex;
2723
    OP_32_64(rotl):
2724
        c = SHIFT_ROL;
2725
        goto gen_shift;
2726
    OP_32_64(rotr):
2727
        c = SHIFT_ROR;
2728
        goto gen_shift;
2729
    gen_shift_maybe_vex:
2730
        if (have_bmi2) {
2731
            if (!const_a2) {
2732
                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2733
                break;
2734
            }
2735
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2736
        }
2737
        /* FALLTHRU */
2738
    gen_shift:
2739
        if (const_a2) {
2740
            tcg_out_shifti(s, c + rexw, a0, a2);
2741
        } else {
2742
            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2743
        }
2744
        break;
2745

2746
    OP_32_64(ctz):
2747
        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2748
        break;
2749
    OP_32_64(clz):
2750
        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2751
        break;
2752
    OP_32_64(ctpop):
2753
        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2754
        break;
2755

2756
    OP_32_64(brcond):
2757
        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2758
                       arg_label(args[3]), 0);
2759
        break;
2760
    OP_32_64(setcond):
2761
        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2762
        break;
2763
    OP_32_64(negsetcond):
2764
        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2765
        break;
2766
    OP_32_64(movcond):
2767
        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2768
        break;
2769

2770
    OP_32_64(bswap16):
2771
        if (a2 & TCG_BSWAP_OS) {
2772
            /* Output must be sign-extended. */
2773
            if (rexw) {
2774
                tcg_out_bswap64(s, a0);
2775
                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2776
            } else {
2777
                tcg_out_bswap32(s, a0);
2778
                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2779
            }
2780
        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2781
            /* Output must be zero-extended, but input isn't. */
2782
            tcg_out_bswap32(s, a0);
2783
            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2784
        } else {
2785
            tcg_out_rolw_8(s, a0);
2786
        }
2787
        break;
2788
    OP_32_64(bswap32):
2789
        tcg_out_bswap32(s, a0);
2790
        if (rexw && (a2 & TCG_BSWAP_OS)) {
2791
            tcg_out_ext32s(s, a0, a0);
2792
        }
2793
        break;
2794

2795
    OP_32_64(neg):
2796
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2797
        break;
2798
    OP_32_64(not):
2799
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2800
        break;
2801

2802
    case INDEX_op_qemu_ld_a64_i32:
2803
        if (TCG_TARGET_REG_BITS == 32) {
2804
            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2805
            break;
2806
        }
2807
        /* fall through */
2808
    case INDEX_op_qemu_ld_a32_i32:
2809
        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2810
        break;
2811
    case INDEX_op_qemu_ld_a32_i64:
2812
        if (TCG_TARGET_REG_BITS == 64) {
2813
            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2814
        } else {
2815
            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2816
        }
2817
        break;
2818
    case INDEX_op_qemu_ld_a64_i64:
2819
        if (TCG_TARGET_REG_BITS == 64) {
2820
            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2821
        } else {
2822
            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2823
        }
2824
        break;
2825
    case INDEX_op_qemu_ld_a32_i128:
2826
    case INDEX_op_qemu_ld_a64_i128:
2827
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2828
        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2829
        break;
2830

2831
    case INDEX_op_qemu_st_a64_i32:
2832
    case INDEX_op_qemu_st8_a64_i32:
2833
        if (TCG_TARGET_REG_BITS == 32) {
2834
            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2835
            break;
2836
        }
2837
        /* fall through */
2838
    case INDEX_op_qemu_st_a32_i32:
2839
    case INDEX_op_qemu_st8_a32_i32:
2840
        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2841
        break;
2842
    case INDEX_op_qemu_st_a32_i64:
2843
        if (TCG_TARGET_REG_BITS == 64) {
2844
            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2845
        } else {
2846
            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2847
        }
2848
        break;
2849
    case INDEX_op_qemu_st_a64_i64:
2850
        if (TCG_TARGET_REG_BITS == 64) {
2851
            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2852
        } else {
2853
            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2854
        }
2855
        break;
2856
    case INDEX_op_qemu_st_a32_i128:
2857
    case INDEX_op_qemu_st_a64_i128:
2858
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2859
        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2860
        break;
2861

2862
    OP_32_64(mulu2):
2863
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2864
        break;
2865
    OP_32_64(muls2):
2866
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2867
        break;
2868
    OP_32_64(add2):
2869
        if (const_args[4]) {
2870
            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2871
        } else {
2872
            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2873
        }
2874
        if (const_args[5]) {
2875
            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2876
        } else {
2877
            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2878
        }
2879
        break;
2880
    OP_32_64(sub2):
2881
        if (const_args[4]) {
2882
            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2883
        } else {
2884
            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2885
        }
2886
        if (const_args[5]) {
2887
            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2888
        } else {
2889
            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2890
        }
2891
        break;
2892

2893
#if TCG_TARGET_REG_BITS == 32
2894
    case INDEX_op_brcond2_i32:
2895
        tcg_out_brcond2(s, args, const_args, 0);
2896
        break;
2897
    case INDEX_op_setcond2_i32:
2898
        tcg_out_setcond2(s, args, const_args);
2899
        break;
2900
#else /* TCG_TARGET_REG_BITS == 64 */
2901
    case INDEX_op_ld32s_i64:
2902
        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2903
        break;
2904
    case INDEX_op_ld_i64:
2905
        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2906
        break;
2907
    case INDEX_op_st_i64:
2908
        if (const_args[0]) {
2909
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2910
            tcg_out32(s, a0);
2911
        } else {
2912
            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2913
        }
2914
        break;
2915

2916
    case INDEX_op_bswap64_i64:
2917
        tcg_out_bswap64(s, a0);
2918
        break;
2919
    case INDEX_op_extrh_i64_i32:
2920
        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2921
        break;
2922
#endif
2923

2924
    OP_32_64(deposit):
2925
        if (args[3] == 0 && args[4] == 8) {
2926
            /* load bits 0..7 */
2927
            if (const_a2) {
2928
                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2929
                            0, a0, 0);
2930
                tcg_out8(s, a2);
2931
            } else {
2932
                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2933
            }
2934
        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2935
            /* load bits 8..15 */
2936
            if (const_a2) {
2937
                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2938
                tcg_out8(s, a2);
2939
            } else {
2940
                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2941
            }
2942
        } else if (args[3] == 0 && args[4] == 16) {
2943
            /* load bits 0..15 */
2944
            if (const_a2) {
2945
                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2946
                            0, a0, 0);
2947
                tcg_out16(s, a2);
2948
            } else {
2949
                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2950
            }
2951
        } else {
2952
            g_assert_not_reached();
2953
        }
2954
        break;
2955

2956
    case INDEX_op_extract_i64:
2957
        if (a2 + args[3] == 32) {
2958
            /* This is a 32-bit zero-extending right shift.  */
2959
            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2960
            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2961
            break;
2962
        }
2963
        /* FALLTHRU */
2964
    case INDEX_op_extract_i32:
2965
        /* On the off-chance that we can use the high-byte registers.
2966
           Otherwise we emit the same ext16 + shift pattern that we
2967
           would have gotten from the normal tcg-op.c expansion.  */
2968
        tcg_debug_assert(a2 == 8 && args[3] == 8);
2969
        if (a1 < 4 && a0 < 8) {
2970
            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2971
        } else {
2972
            tcg_out_ext16u(s, a0, a1);
2973
            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2974
        }
2975
        break;
2976

2977
    case INDEX_op_sextract_i32:
2978
        /* We don't implement sextract_i64, as we cannot sign-extend to
2979
           64-bits without using the REX prefix that explicitly excludes
2980
           access to the high-byte registers.  */
2981
        tcg_debug_assert(a2 == 8 && args[3] == 8);
2982
        if (a1 < 4 && a0 < 8) {
2983
            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2984
        } else {
2985
            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2986
            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2987
        }
2988
        break;
2989

2990
    OP_32_64(extract2):
2991
        /* Note that SHRD outputs to the r/m operand.  */
2992
        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2993
        tcg_out8(s, args[3]);
2994
        break;
2995

2996
    case INDEX_op_mb:
2997
        tcg_out_mb(s, a0);
2998
        break;
2999
    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
3000
    case INDEX_op_mov_i64:
3001
    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
3002
    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
3003
    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
3004
    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
3005
    case INDEX_op_ext8s_i64:
3006
    case INDEX_op_ext8u_i32:
3007
    case INDEX_op_ext8u_i64:
3008
    case INDEX_op_ext16s_i32:
3009
    case INDEX_op_ext16s_i64:
3010
    case INDEX_op_ext16u_i32:
3011
    case INDEX_op_ext16u_i64:
3012
    case INDEX_op_ext32s_i64:
3013
    case INDEX_op_ext32u_i64:
3014
    case INDEX_op_ext_i32_i64:
3015
    case INDEX_op_extu_i32_i64:
3016
    case INDEX_op_extrl_i64_i32:
3017
    default:
3018
        g_assert_not_reached();
3019
    }
3020

3021
#undef OP_32_64
3022
}
3023

3024
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
3025
                           unsigned vecl, unsigned vece,
3026
                           const TCGArg args[TCG_MAX_OP_ARGS],
3027
                           const int const_args[TCG_MAX_OP_ARGS])
3028
{
3029
    static int const add_insn[4] = {
3030
        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
3031
    };
3032
    static int const ssadd_insn[4] = {
3033
        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
3034
    };
3035
    static int const usadd_insn[4] = {
3036
        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
3037
    };
3038
    static int const sub_insn[4] = {
3039
        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
3040
    };
3041
    static int const sssub_insn[4] = {
3042
        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
3043
    };
3044
    static int const ussub_insn[4] = {
3045
        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
3046
    };
3047
    static int const mul_insn[4] = {
3048
        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
3049
    };
3050
    static int const shift_imm_insn[4] = {
3051
        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
3052
    };
3053
    static int const cmpeq_insn[4] = {
3054
        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
3055
    };
3056
    static int const cmpgt_insn[4] = {
3057
        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
3058
    };
3059
    static int const punpckl_insn[4] = {
3060
        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
3061
    };
3062
    static int const punpckh_insn[4] = {
3063
        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
3064
    };
3065
    static int const packss_insn[4] = {
3066
        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
3067
    };
3068
    static int const packus_insn[4] = {
3069
        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
3070
    };
3071
    static int const smin_insn[4] = {
3072
        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
3073
    };
3074
    static int const smax_insn[4] = {
3075
        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
3076
    };
3077
    static int const umin_insn[4] = {
3078
        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
3079
    };
3080
    static int const umax_insn[4] = {
3081
        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
3082
    };
3083
    static int const rotlv_insn[4] = {
3084
        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
3085
    };
3086
    static int const rotrv_insn[4] = {
3087
        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3088
    };
3089
    static int const shlv_insn[4] = {
3090
        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3091
    };
3092
    static int const shrv_insn[4] = {
3093
        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3094
    };
3095
    static int const sarv_insn[4] = {
3096
        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3097
    };
3098
    static int const shls_insn[4] = {
3099
        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3100
    };
3101
    static int const shrs_insn[4] = {
3102
        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3103
    };
3104
    static int const sars_insn[4] = {
3105
        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3106
    };
3107
    static int const vpshldi_insn[4] = {
3108
        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3109
    };
3110
    static int const vpshldv_insn[4] = {
3111
        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3112
    };
3113
    static int const vpshrdv_insn[4] = {
3114
        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3115
    };
3116
    static int const abs_insn[4] = {
3117
        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3118
    };
3119

3120
    TCGType type = vecl + TCG_TYPE_V64;
3121
    int insn, sub;
3122
    TCGArg a0, a1, a2, a3;
3123

3124
    a0 = args[0];
3125
    a1 = args[1];
3126
    a2 = args[2];
3127

3128
    switch (opc) {
3129
    case INDEX_op_add_vec:
3130
        insn = add_insn[vece];
3131
        goto gen_simd;
3132
    case INDEX_op_ssadd_vec:
3133
        insn = ssadd_insn[vece];
3134
        goto gen_simd;
3135
    case INDEX_op_usadd_vec:
3136
        insn = usadd_insn[vece];
3137
        goto gen_simd;
3138
    case INDEX_op_sub_vec:
3139
        insn = sub_insn[vece];
3140
        goto gen_simd;
3141
    case INDEX_op_sssub_vec:
3142
        insn = sssub_insn[vece];
3143
        goto gen_simd;
3144
    case INDEX_op_ussub_vec:
3145
        insn = ussub_insn[vece];
3146
        goto gen_simd;
3147
    case INDEX_op_mul_vec:
3148
        insn = mul_insn[vece];
3149
        goto gen_simd;
3150
    case INDEX_op_and_vec:
3151
        insn = OPC_PAND;
3152
        goto gen_simd;
3153
    case INDEX_op_or_vec:
3154
        insn = OPC_POR;
3155
        goto gen_simd;
3156
    case INDEX_op_xor_vec:
3157
        insn = OPC_PXOR;
3158
        goto gen_simd;
3159
    case INDEX_op_smin_vec:
3160
        insn = smin_insn[vece];
3161
        goto gen_simd;
3162
    case INDEX_op_umin_vec:
3163
        insn = umin_insn[vece];
3164
        goto gen_simd;
3165
    case INDEX_op_smax_vec:
3166
        insn = smax_insn[vece];
3167
        goto gen_simd;
3168
    case INDEX_op_umax_vec:
3169
        insn = umax_insn[vece];
3170
        goto gen_simd;
3171
    case INDEX_op_shlv_vec:
3172
        insn = shlv_insn[vece];
3173
        goto gen_simd;
3174
    case INDEX_op_shrv_vec:
3175
        insn = shrv_insn[vece];
3176
        goto gen_simd;
3177
    case INDEX_op_sarv_vec:
3178
        insn = sarv_insn[vece];
3179
        goto gen_simd;
3180
    case INDEX_op_rotlv_vec:
3181
        insn = rotlv_insn[vece];
3182
        goto gen_simd;
3183
    case INDEX_op_rotrv_vec:
3184
        insn = rotrv_insn[vece];
3185
        goto gen_simd;
3186
    case INDEX_op_shls_vec:
3187
        insn = shls_insn[vece];
3188
        goto gen_simd;
3189
    case INDEX_op_shrs_vec:
3190
        insn = shrs_insn[vece];
3191
        goto gen_simd;
3192
    case INDEX_op_sars_vec:
3193
        insn = sars_insn[vece];
3194
        goto gen_simd;
3195
    case INDEX_op_x86_punpckl_vec:
3196
        insn = punpckl_insn[vece];
3197
        goto gen_simd;
3198
    case INDEX_op_x86_punpckh_vec:
3199
        insn = punpckh_insn[vece];
3200
        goto gen_simd;
3201
    case INDEX_op_x86_packss_vec:
3202
        insn = packss_insn[vece];
3203
        goto gen_simd;
3204
    case INDEX_op_x86_packus_vec:
3205
        insn = packus_insn[vece];
3206
        goto gen_simd;
3207
    case INDEX_op_x86_vpshldv_vec:
3208
        insn = vpshldv_insn[vece];
3209
        a1 = a2;
3210
        a2 = args[3];
3211
        goto gen_simd;
3212
    case INDEX_op_x86_vpshrdv_vec:
3213
        insn = vpshrdv_insn[vece];
3214
        a1 = a2;
3215
        a2 = args[3];
3216
        goto gen_simd;
3217
#if TCG_TARGET_REG_BITS == 32
3218
    case INDEX_op_dup2_vec:
3219
        /* First merge the two 32-bit inputs to a single 64-bit element. */
3220
        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3221
        /* Then replicate the 64-bit elements across the rest of the vector. */
3222
        if (type != TCG_TYPE_V64) {
3223
            tcg_out_dup_vec(s, type, MO_64, a0, a0);
3224
        }
3225
        break;
3226
#endif
3227
    case INDEX_op_abs_vec:
3228
        insn = abs_insn[vece];
3229
        a2 = a1;
3230
        a1 = 0;
3231
        goto gen_simd;
3232
    gen_simd:
3233
        tcg_debug_assert(insn != OPC_UD2);
3234
        if (type == TCG_TYPE_V256) {
3235
            insn |= P_VEXL;
3236
        }
3237
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3238
        break;
3239

3240
    case INDEX_op_cmp_vec:
3241
        sub = args[3];
3242
        if (sub == TCG_COND_EQ) {
3243
            insn = cmpeq_insn[vece];
3244
        } else if (sub == TCG_COND_GT) {
3245
            insn = cmpgt_insn[vece];
3246
        } else {
3247
            g_assert_not_reached();
3248
        }
3249
        goto gen_simd;
3250

3251
    case INDEX_op_andc_vec:
3252
        insn = OPC_PANDN;
3253
        if (type == TCG_TYPE_V256) {
3254
            insn |= P_VEXL;
3255
        }
3256
        tcg_out_vex_modrm(s, insn, a0, a2, a1);
3257
        break;
3258

3259
    case INDEX_op_shli_vec:
3260
        insn = shift_imm_insn[vece];
3261
        sub = 6;
3262
        goto gen_shift;
3263
    case INDEX_op_shri_vec:
3264
        insn = shift_imm_insn[vece];
3265
        sub = 2;
3266
        goto gen_shift;
3267
    case INDEX_op_sari_vec:
3268
        if (vece == MO_64) {
3269
            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3270
        } else {
3271
            insn = shift_imm_insn[vece];
3272
        }
3273
        sub = 4;
3274
        goto gen_shift;
3275
    case INDEX_op_rotli_vec:
3276
        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3277
        if (vece == MO_64) {
3278
            insn |= P_VEXW;
3279
        }
3280
        sub = 1;
3281
        goto gen_shift;
3282
    gen_shift:
3283
        tcg_debug_assert(vece != MO_8);
3284
        if (type == TCG_TYPE_V256) {
3285
            insn |= P_VEXL;
3286
        }
3287
        tcg_out_vex_modrm(s, insn, sub, a0, a1);
3288
        tcg_out8(s, a2);
3289
        break;
3290

3291
    case INDEX_op_ld_vec:
3292
        tcg_out_ld(s, type, a0, a1, a2);
3293
        break;
3294
    case INDEX_op_st_vec:
3295
        tcg_out_st(s, type, a0, a1, a2);
3296
        break;
3297
    case INDEX_op_dupm_vec:
3298
        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3299
        break;
3300

3301
    case INDEX_op_x86_shufps_vec:
3302
        insn = OPC_SHUFPS;
3303
        sub = args[3];
3304
        goto gen_simd_imm8;
3305
    case INDEX_op_x86_blend_vec:
3306
        if (vece == MO_16) {
3307
            insn = OPC_PBLENDW;
3308
        } else if (vece == MO_32) {
3309
            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3310
        } else {
3311
            g_assert_not_reached();
3312
        }
3313
        sub = args[3];
3314
        goto gen_simd_imm8;
3315
    case INDEX_op_x86_vperm2i128_vec:
3316
        insn = OPC_VPERM2I128;
3317
        sub = args[3];
3318
        goto gen_simd_imm8;
3319
    case INDEX_op_x86_vpshldi_vec:
3320
        insn = vpshldi_insn[vece];
3321
        sub = args[3];
3322
        goto gen_simd_imm8;
3323

3324
    case INDEX_op_not_vec:
3325
        insn = OPC_VPTERNLOGQ;
3326
        a2 = a1;
3327
        sub = 0x33; /* !B */
3328
        goto gen_simd_imm8;
3329
    case INDEX_op_nor_vec:
3330
        insn = OPC_VPTERNLOGQ;
3331
        sub = 0x11; /* norCB */
3332
        goto gen_simd_imm8;
3333
    case INDEX_op_nand_vec:
3334
        insn = OPC_VPTERNLOGQ;
3335
        sub = 0x77; /* nandCB */
3336
        goto gen_simd_imm8;
3337
    case INDEX_op_eqv_vec:
3338
        insn = OPC_VPTERNLOGQ;
3339
        sub = 0x99; /* xnorCB */
3340
        goto gen_simd_imm8;
3341
    case INDEX_op_orc_vec:
3342
        insn = OPC_VPTERNLOGQ;
3343
        sub = 0xdd; /* orB!C */
3344
        goto gen_simd_imm8;
3345

3346
    case INDEX_op_bitsel_vec:
3347
        insn = OPC_VPTERNLOGQ;
3348
        a3 = args[3];
3349
        if (a0 == a1) {
3350
            a1 = a2;
3351
            a2 = a3;
3352
            sub = 0xca; /* A?B:C */
3353
        } else if (a0 == a2) {
3354
            a2 = a3;
3355
            sub = 0xe2; /* B?A:C */
3356
        } else {
3357
            tcg_out_mov(s, type, a0, a3);
3358
            sub = 0xb8; /* B?C:A */
3359
        }
3360
        goto gen_simd_imm8;
3361

3362
    gen_simd_imm8:
3363
        tcg_debug_assert(insn != OPC_UD2);
3364
        if (type == TCG_TYPE_V256) {
3365
            insn |= P_VEXL;
3366
        }
3367
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3368
        tcg_out8(s, sub);
3369
        break;
3370

3371
    case INDEX_op_x86_vpblendvb_vec:
3372
        insn = OPC_VPBLENDVB;
3373
        if (type == TCG_TYPE_V256) {
3374
            insn |= P_VEXL;
3375
        }
3376
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
3377
        tcg_out8(s, args[3] << 4);
3378
        break;
3379

3380
    case INDEX_op_x86_psrldq_vec:
3381
        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3382
        tcg_out8(s, a2);
3383
        break;
3384

3385
    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3386
    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3387
    default:
3388
        g_assert_not_reached();
3389
    }
3390
}
3391

3392
static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3393
{
3394
    switch (op) {
3395
    case INDEX_op_goto_ptr:
3396
        return C_O0_I1(r);
3397

3398
    case INDEX_op_ld8u_i32:
3399
    case INDEX_op_ld8u_i64:
3400
    case INDEX_op_ld8s_i32:
3401
    case INDEX_op_ld8s_i64:
3402
    case INDEX_op_ld16u_i32:
3403
    case INDEX_op_ld16u_i64:
3404
    case INDEX_op_ld16s_i32:
3405
    case INDEX_op_ld16s_i64:
3406
    case INDEX_op_ld_i32:
3407
    case INDEX_op_ld32u_i64:
3408
    case INDEX_op_ld32s_i64:
3409
    case INDEX_op_ld_i64:
3410
        return C_O1_I1(r, r);
3411

3412
    case INDEX_op_st8_i32:
3413
    case INDEX_op_st8_i64:
3414
        return C_O0_I2(qi, r);
3415

3416
    case INDEX_op_st16_i32:
3417
    case INDEX_op_st16_i64:
3418
    case INDEX_op_st_i32:
3419
    case INDEX_op_st32_i64:
3420
        return C_O0_I2(ri, r);
3421

3422
    case INDEX_op_st_i64:
3423
        return C_O0_I2(re, r);
3424

3425
    case INDEX_op_add_i32:
3426
    case INDEX_op_add_i64:
3427
        return C_O1_I2(r, r, re);
3428

3429
    case INDEX_op_sub_i32:
3430
    case INDEX_op_sub_i64:
3431
    case INDEX_op_mul_i32:
3432
    case INDEX_op_mul_i64:
3433
    case INDEX_op_or_i32:
3434
    case INDEX_op_or_i64:
3435
    case INDEX_op_xor_i32:
3436
    case INDEX_op_xor_i64:
3437
        return C_O1_I2(r, 0, re);
3438

3439
    case INDEX_op_and_i32:
3440
    case INDEX_op_and_i64:
3441
        return C_O1_I2(r, 0, reZ);
3442

3443
    case INDEX_op_andc_i32:
3444
    case INDEX_op_andc_i64:
3445
        return C_O1_I2(r, r, rI);
3446

3447
    case INDEX_op_shl_i32:
3448
    case INDEX_op_shl_i64:
3449
    case INDEX_op_shr_i32:
3450
    case INDEX_op_shr_i64:
3451
    case INDEX_op_sar_i32:
3452
    case INDEX_op_sar_i64:
3453
        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3454

3455
    case INDEX_op_rotl_i32:
3456
    case INDEX_op_rotl_i64:
3457
    case INDEX_op_rotr_i32:
3458
    case INDEX_op_rotr_i64:
3459
        return C_O1_I2(r, 0, ci);
3460

3461
    case INDEX_op_brcond_i32:
3462
    case INDEX_op_brcond_i64:
3463
        return C_O0_I2(r, reT);
3464

3465
    case INDEX_op_bswap16_i32:
3466
    case INDEX_op_bswap16_i64:
3467
    case INDEX_op_bswap32_i32:
3468
    case INDEX_op_bswap32_i64:
3469
    case INDEX_op_bswap64_i64:
3470
    case INDEX_op_neg_i32:
3471
    case INDEX_op_neg_i64:
3472
    case INDEX_op_not_i32:
3473
    case INDEX_op_not_i64:
3474
    case INDEX_op_extrh_i64_i32:
3475
        return C_O1_I1(r, 0);
3476

3477
    case INDEX_op_ext8s_i32:
3478
    case INDEX_op_ext8s_i64:
3479
    case INDEX_op_ext8u_i32:
3480
    case INDEX_op_ext8u_i64:
3481
        return C_O1_I1(r, q);
3482

3483
    case INDEX_op_ext16s_i32:
3484
    case INDEX_op_ext16s_i64:
3485
    case INDEX_op_ext16u_i32:
3486
    case INDEX_op_ext16u_i64:
3487
    case INDEX_op_ext32s_i64:
3488
    case INDEX_op_ext32u_i64:
3489
    case INDEX_op_ext_i32_i64:
3490
    case INDEX_op_extu_i32_i64:
3491
    case INDEX_op_extrl_i64_i32:
3492
    case INDEX_op_extract_i32:
3493
    case INDEX_op_extract_i64:
3494
    case INDEX_op_sextract_i32:
3495
    case INDEX_op_ctpop_i32:
3496
    case INDEX_op_ctpop_i64:
3497
        return C_O1_I1(r, r);
3498

3499
    case INDEX_op_extract2_i32:
3500
    case INDEX_op_extract2_i64:
3501
        return C_O1_I2(r, 0, r);
3502

3503
    case INDEX_op_deposit_i32:
3504
    case INDEX_op_deposit_i64:
3505
        return C_O1_I2(q, 0, qi);
3506

3507
    case INDEX_op_setcond_i32:
3508
    case INDEX_op_setcond_i64:
3509
    case INDEX_op_negsetcond_i32:
3510
    case INDEX_op_negsetcond_i64:
3511
        return C_O1_I2(q, r, reT);
3512

3513
    case INDEX_op_movcond_i32:
3514
    case INDEX_op_movcond_i64:
3515
        return C_O1_I4(r, r, reT, r, 0);
3516

3517
    case INDEX_op_div2_i32:
3518
    case INDEX_op_div2_i64:
3519
    case INDEX_op_divu2_i32:
3520
    case INDEX_op_divu2_i64:
3521
        return C_O2_I3(a, d, 0, 1, r);
3522

3523
    case INDEX_op_mulu2_i32:
3524
    case INDEX_op_mulu2_i64:
3525
    case INDEX_op_muls2_i32:
3526
    case INDEX_op_muls2_i64:
3527
        return C_O2_I2(a, d, a, r);
3528

3529
    case INDEX_op_add2_i32:
3530
    case INDEX_op_add2_i64:
3531
    case INDEX_op_sub2_i32:
3532
    case INDEX_op_sub2_i64:
3533
        return C_N1_O1_I4(r, r, 0, 1, re, re);
3534

3535
    case INDEX_op_ctz_i32:
3536
    case INDEX_op_ctz_i64:
3537
        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3538

3539
    case INDEX_op_clz_i32:
3540
    case INDEX_op_clz_i64:
3541
        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3542

3543
    case INDEX_op_qemu_ld_a32_i32:
3544
        return C_O1_I1(r, L);
3545
    case INDEX_op_qemu_ld_a64_i32:
3546
        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3547

3548
    case INDEX_op_qemu_st_a32_i32:
3549
        return C_O0_I2(L, L);
3550
    case INDEX_op_qemu_st_a64_i32:
3551
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3552
    case INDEX_op_qemu_st8_a32_i32:
3553
        return C_O0_I2(s, L);
3554
    case INDEX_op_qemu_st8_a64_i32:
3555
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3556

3557
    case INDEX_op_qemu_ld_a32_i64:
3558
        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3559
    case INDEX_op_qemu_ld_a64_i64:
3560
        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3561

3562
    case INDEX_op_qemu_st_a32_i64:
3563
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3564
    case INDEX_op_qemu_st_a64_i64:
3565
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3566

3567
    case INDEX_op_qemu_ld_a32_i128:
3568
    case INDEX_op_qemu_ld_a64_i128:
3569
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3570
        return C_O2_I1(r, r, L);
3571
    case INDEX_op_qemu_st_a32_i128:
3572
    case INDEX_op_qemu_st_a64_i128:
3573
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3574
        return C_O0_I3(L, L, L);
3575

3576
    case INDEX_op_brcond2_i32:
3577
        return C_O0_I4(r, r, ri, ri);
3578

3579
    case INDEX_op_setcond2_i32:
3580
        return C_O1_I4(r, r, r, ri, ri);
3581

3582
    case INDEX_op_ld_vec:
3583
    case INDEX_op_dupm_vec:
3584
        return C_O1_I1(x, r);
3585

3586
    case INDEX_op_st_vec:
3587
        return C_O0_I2(x, r);
3588

3589
    case INDEX_op_add_vec:
3590
    case INDEX_op_sub_vec:
3591
    case INDEX_op_mul_vec:
3592
    case INDEX_op_and_vec:
3593
    case INDEX_op_or_vec:
3594
    case INDEX_op_xor_vec:
3595
    case INDEX_op_andc_vec:
3596
    case INDEX_op_orc_vec:
3597
    case INDEX_op_nand_vec:
3598
    case INDEX_op_nor_vec:
3599
    case INDEX_op_eqv_vec:
3600
    case INDEX_op_ssadd_vec:
3601
    case INDEX_op_usadd_vec:
3602
    case INDEX_op_sssub_vec:
3603
    case INDEX_op_ussub_vec:
3604
    case INDEX_op_smin_vec:
3605
    case INDEX_op_umin_vec:
3606
    case INDEX_op_smax_vec:
3607
    case INDEX_op_umax_vec:
3608
    case INDEX_op_shlv_vec:
3609
    case INDEX_op_shrv_vec:
3610
    case INDEX_op_sarv_vec:
3611
    case INDEX_op_rotlv_vec:
3612
    case INDEX_op_rotrv_vec:
3613
    case INDEX_op_shls_vec:
3614
    case INDEX_op_shrs_vec:
3615
    case INDEX_op_sars_vec:
3616
    case INDEX_op_cmp_vec:
3617
    case INDEX_op_x86_shufps_vec:
3618
    case INDEX_op_x86_blend_vec:
3619
    case INDEX_op_x86_packss_vec:
3620
    case INDEX_op_x86_packus_vec:
3621
    case INDEX_op_x86_vperm2i128_vec:
3622
    case INDEX_op_x86_punpckl_vec:
3623
    case INDEX_op_x86_punpckh_vec:
3624
    case INDEX_op_x86_vpshldi_vec:
3625
#if TCG_TARGET_REG_BITS == 32
3626
    case INDEX_op_dup2_vec:
3627
#endif
3628
        return C_O1_I2(x, x, x);
3629

3630
    case INDEX_op_abs_vec:
3631
    case INDEX_op_dup_vec:
3632
    case INDEX_op_not_vec:
3633
    case INDEX_op_shli_vec:
3634
    case INDEX_op_shri_vec:
3635
    case INDEX_op_sari_vec:
3636
    case INDEX_op_rotli_vec:
3637
    case INDEX_op_x86_psrldq_vec:
3638
        return C_O1_I1(x, x);
3639

3640
    case INDEX_op_x86_vpshldv_vec:
3641
    case INDEX_op_x86_vpshrdv_vec:
3642
        return C_O1_I3(x, 0, x, x);
3643

3644
    case INDEX_op_bitsel_vec:
3645
    case INDEX_op_x86_vpblendvb_vec:
3646
        return C_O1_I3(x, x, x, x);
3647

3648
    default:
3649
        g_assert_not_reached();
3650
    }
3651
}
3652

3653
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3654
{
3655
    switch (opc) {
3656
    case INDEX_op_add_vec:
3657
    case INDEX_op_sub_vec:
3658
    case INDEX_op_and_vec:
3659
    case INDEX_op_or_vec:
3660
    case INDEX_op_xor_vec:
3661
    case INDEX_op_andc_vec:
3662
    case INDEX_op_orc_vec:
3663
    case INDEX_op_nand_vec:
3664
    case INDEX_op_nor_vec:
3665
    case INDEX_op_eqv_vec:
3666
    case INDEX_op_not_vec:
3667
    case INDEX_op_bitsel_vec:
3668
        return 1;
3669
    case INDEX_op_cmp_vec:
3670
    case INDEX_op_cmpsel_vec:
3671
        return -1;
3672

3673
    case INDEX_op_rotli_vec:
3674
        return have_avx512vl && vece >= MO_32 ? 1 : -1;
3675

3676
    case INDEX_op_shli_vec:
3677
    case INDEX_op_shri_vec:
3678
        /* We must expand the operation for MO_8.  */
3679
        return vece == MO_8 ? -1 : 1;
3680

3681
    case INDEX_op_sari_vec:
3682
        switch (vece) {
3683
        case MO_8:
3684
            return -1;
3685
        case MO_16:
3686
        case MO_32:
3687
            return 1;
3688
        case MO_64:
3689
            if (have_avx512vl) {
3690
                return 1;
3691
            }
3692
            /*
3693
             * We can emulate this for MO_64, but it does not pay off
3694
             * unless we're producing at least 4 values.
3695
             */
3696
            return type >= TCG_TYPE_V256 ? -1 : 0;
3697
        }
3698
        return 0;
3699

3700
    case INDEX_op_shls_vec:
3701
    case INDEX_op_shrs_vec:
3702
        return vece >= MO_16;
3703
    case INDEX_op_sars_vec:
3704
        switch (vece) {
3705
        case MO_16:
3706
        case MO_32:
3707
            return 1;
3708
        case MO_64:
3709
            return have_avx512vl;
3710
        }
3711
        return 0;
3712
    case INDEX_op_rotls_vec:
3713
        return vece >= MO_16 ? -1 : 0;
3714

3715
    case INDEX_op_shlv_vec:
3716
    case INDEX_op_shrv_vec:
3717
        switch (vece) {
3718
        case MO_16:
3719
            return have_avx512bw;
3720
        case MO_32:
3721
        case MO_64:
3722
            return have_avx2;
3723
        }
3724
        return 0;
3725
    case INDEX_op_sarv_vec:
3726
        switch (vece) {
3727
        case MO_16:
3728
            return have_avx512bw;
3729
        case MO_32:
3730
            return have_avx2;
3731
        case MO_64:
3732
            return have_avx512vl;
3733
        }
3734
        return 0;
3735
    case INDEX_op_rotlv_vec:
3736
    case INDEX_op_rotrv_vec:
3737
        switch (vece) {
3738
        case MO_16:
3739
            return have_avx512vbmi2 ? -1 : 0;
3740
        case MO_32:
3741
        case MO_64:
3742
            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3743
        }
3744
        return 0;
3745

3746
    case INDEX_op_mul_vec:
3747
        switch (vece) {
3748
        case MO_8:
3749
            return -1;
3750
        case MO_64:
3751
            return have_avx512dq;
3752
        }
3753
        return 1;
3754

3755
    case INDEX_op_ssadd_vec:
3756
    case INDEX_op_usadd_vec:
3757
    case INDEX_op_sssub_vec:
3758
    case INDEX_op_ussub_vec:
3759
        return vece <= MO_16;
3760
    case INDEX_op_smin_vec:
3761
    case INDEX_op_smax_vec:
3762
    case INDEX_op_umin_vec:
3763
    case INDEX_op_umax_vec:
3764
    case INDEX_op_abs_vec:
3765
        return vece <= MO_32 || have_avx512vl;
3766

3767
    default:
3768
        return 0;
3769
    }
3770
}
3771

3772
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
3773
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3774
{
3775
    uint8_t mask;
3776

3777
    tcg_debug_assert(vece == MO_8);
3778
    if (right) {
3779
        mask = 0xff >> imm;
3780
        tcg_gen_shri_vec(MO_16, v0, v1, imm);
3781
    } else {
3782
        mask = 0xff << imm;
3783
        tcg_gen_shli_vec(MO_16, v0, v1, imm);
3784
    }
3785
    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
3786
}
3787

3788
static void expand_vec_sari(TCGType type, unsigned vece,
3789
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3790
{
3791
    TCGv_vec t1, t2;
3792

3793
    switch (vece) {
3794
    case MO_8:
3795
        /* Unpack to 16-bit, shift, and repack.  */
3796
        t1 = tcg_temp_new_vec(type);
3797
        t2 = tcg_temp_new_vec(type);
3798
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3799
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3800
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3801
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3802
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3803
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3804
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3805
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3806
        tcg_temp_free_vec(t1);
3807
        tcg_temp_free_vec(t2);
3808
        break;
3809

3810
    case MO_64:
3811
        t1 = tcg_temp_new_vec(type);
3812
        if (imm <= 32) {
3813
            /*
3814
             * We can emulate a small sign extend by performing an arithmetic
3815
             * 32-bit shift and overwriting the high half of a 64-bit logical
3816
             * shift.  Note that the ISA says shift of 32 is valid, but TCG
3817
             * does not, so we have to bound the smaller shift -- we get the
3818
             * same result in the high half either way.
3819
             */
3820
            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3821
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3822
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3823
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3824
                      tcgv_vec_arg(t1), 0xaa);
3825
        } else {
3826
            /* Otherwise we will need to use a compare vs 0 to produce
3827
             * the sign-extend, shift and merge.
3828
             */
3829
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3830
                            tcg_constant_vec(type, MO_64, 0), v1);
3831
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3832
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3833
            tcg_gen_or_vec(MO_64, v0, v0, t1);
3834
        }
3835
        tcg_temp_free_vec(t1);
3836
        break;
3837

3838
    default:
3839
        g_assert_not_reached();
3840
    }
3841
}
3842

3843
static void expand_vec_rotli(TCGType type, unsigned vece,
3844
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3845
{
3846
    TCGv_vec t;
3847

3848
    if (vece != MO_8 && have_avx512vbmi2) {
3849
        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3850
                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3851
        return;
3852
    }
3853

3854
    t = tcg_temp_new_vec(type);
3855
    tcg_gen_shli_vec(vece, t, v1, imm);
3856
    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3857
    tcg_gen_or_vec(vece, v0, v0, t);
3858
    tcg_temp_free_vec(t);
3859
}
3860

3861
static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3862
                            TCGv_vec v1, TCGv_vec sh, bool right)
3863
{
3864
    TCGv_vec t;
3865

3866
    if (have_avx512vbmi2) {
3867
        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3868
                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3869
                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3870
        return;
3871
    }
3872

3873
    t = tcg_temp_new_vec(type);
3874
    tcg_gen_dupi_vec(vece, t, 8 << vece);
3875
    tcg_gen_sub_vec(vece, t, t, sh);
3876
    if (right) {
3877
        tcg_gen_shlv_vec(vece, t, v1, t);
3878
        tcg_gen_shrv_vec(vece, v0, v1, sh);
3879
    } else {
3880
        tcg_gen_shrv_vec(vece, t, v1, t);
3881
        tcg_gen_shlv_vec(vece, v0, v1, sh);
3882
    }
3883
    tcg_gen_or_vec(vece, v0, v0, t);
3884
    tcg_temp_free_vec(t);
3885
}
3886

3887
static void expand_vec_rotls(TCGType type, unsigned vece,
3888
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3889
{
3890
    TCGv_vec t = tcg_temp_new_vec(type);
3891

3892
    tcg_debug_assert(vece != MO_8);
3893

3894
    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3895
        tcg_gen_dup_i32_vec(vece, t, lsh);
3896
        if (vece >= MO_32) {
3897
            tcg_gen_rotlv_vec(vece, v0, v1, t);
3898
        } else {
3899
            expand_vec_rotv(type, vece, v0, v1, t, false);
3900
        }
3901
    } else {
3902
        TCGv_i32 rsh = tcg_temp_new_i32();
3903

3904
        tcg_gen_neg_i32(rsh, lsh);
3905
        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3906
        tcg_gen_shls_vec(vece, t, v1, lsh);
3907
        tcg_gen_shrs_vec(vece, v0, v1, rsh);
3908
        tcg_gen_or_vec(vece, v0, v0, t);
3909

3910
        tcg_temp_free_i32(rsh);
3911
    }
3912

3913
    tcg_temp_free_vec(t);
3914
}
3915

3916
static void expand_vec_mul(TCGType type, unsigned vece,
3917
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3918
{
3919
    TCGv_vec t1, t2, t3, t4, zero;
3920

3921
    tcg_debug_assert(vece == MO_8);
3922

3923
    /*
3924
     * Unpack v1 bytes to words, 0 | x.
3925
     * Unpack v2 bytes to words, y | 0.
3926
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3927
     * Shift logical right by 8 bits to clear the high 8 bytes before
3928
     * using an unsigned saturated pack.
3929
     *
3930
     * The difference between the V64, V128 and V256 cases is merely how
3931
     * we distribute the expansion between temporaries.
3932
     */
3933
    switch (type) {
3934
    case TCG_TYPE_V64:
3935
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3936
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3937
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3938
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3939
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3940
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3941
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3942
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3943
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3944
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3945
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3946
        tcg_temp_free_vec(t1);
3947
        tcg_temp_free_vec(t2);
3948
        break;
3949

3950
    case TCG_TYPE_V128:
3951
    case TCG_TYPE_V256:
3952
        t1 = tcg_temp_new_vec(type);
3953
        t2 = tcg_temp_new_vec(type);
3954
        t3 = tcg_temp_new_vec(type);
3955
        t4 = tcg_temp_new_vec(type);
3956
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3957
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3958
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3959
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3960
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3961
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3962
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3963
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3964
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3965
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3966
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3967
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3968
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3969
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3970
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3971
        tcg_temp_free_vec(t1);
3972
        tcg_temp_free_vec(t2);
3973
        tcg_temp_free_vec(t3);
3974
        tcg_temp_free_vec(t4);
3975
        break;
3976

3977
    default:
3978
        g_assert_not_reached();
3979
    }
3980
}
3981

3982
static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3983
                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3984
{
3985
    enum {
3986
        NEED_INV  = 1,
3987
        NEED_SWAP = 2,
3988
        NEED_BIAS = 4,
3989
        NEED_UMIN = 8,
3990
        NEED_UMAX = 16,
3991
    };
3992
    TCGv_vec t1, t2, t3;
3993
    uint8_t fixup;
3994

3995
    switch (cond) {
3996
    case TCG_COND_EQ:
3997
    case TCG_COND_GT:
3998
        fixup = 0;
3999
        break;
4000
    case TCG_COND_NE:
4001
    case TCG_COND_LE:
4002
        fixup = NEED_INV;
4003
        break;
4004
    case TCG_COND_LT:
4005
        fixup = NEED_SWAP;
4006
        break;
4007
    case TCG_COND_GE:
4008
        fixup = NEED_SWAP | NEED_INV;
4009
        break;
4010
    case TCG_COND_LEU:
4011
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4012
            fixup = NEED_UMIN;
4013
        } else {
4014
            fixup = NEED_BIAS | NEED_INV;
4015
        }
4016
        break;
4017
    case TCG_COND_GTU:
4018
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
4019
            fixup = NEED_UMIN | NEED_INV;
4020
        } else {
4021
            fixup = NEED_BIAS;
4022
        }
4023
        break;
4024
    case TCG_COND_GEU:
4025
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4026
            fixup = NEED_UMAX;
4027
        } else {
4028
            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
4029
        }
4030
        break;
4031
    case TCG_COND_LTU:
4032
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
4033
            fixup = NEED_UMAX | NEED_INV;
4034
        } else {
4035
            fixup = NEED_BIAS | NEED_SWAP;
4036
        }
4037
        break;
4038
    default:
4039
        g_assert_not_reached();
4040
    }
4041

4042
    if (fixup & NEED_INV) {
4043
        cond = tcg_invert_cond(cond);
4044
    }
4045
    if (fixup & NEED_SWAP) {
4046
        t1 = v1, v1 = v2, v2 = t1;
4047
        cond = tcg_swap_cond(cond);
4048
    }
4049

4050
    t1 = t2 = NULL;
4051
    if (fixup & (NEED_UMIN | NEED_UMAX)) {
4052
        t1 = tcg_temp_new_vec(type);
4053
        if (fixup & NEED_UMIN) {
4054
            tcg_gen_umin_vec(vece, t1, v1, v2);
4055
        } else {
4056
            tcg_gen_umax_vec(vece, t1, v1, v2);
4057
        }
4058
        v2 = t1;
4059
        cond = TCG_COND_EQ;
4060
    } else if (fixup & NEED_BIAS) {
4061
        t1 = tcg_temp_new_vec(type);
4062
        t2 = tcg_temp_new_vec(type);
4063
        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4064
        tcg_gen_sub_vec(vece, t1, v1, t3);
4065
        tcg_gen_sub_vec(vece, t2, v2, t3);
4066
        v1 = t1;
4067
        v2 = t2;
4068
        cond = tcg_signed_cond(cond);
4069
    }
4070

4071
    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4072
    /* Expand directly; do not recurse.  */
4073
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
4074
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4075

4076
    if (t1) {
4077
        tcg_temp_free_vec(t1);
4078
        if (t2) {
4079
            tcg_temp_free_vec(t2);
4080
        }
4081
    }
4082
    return fixup & NEED_INV;
4083
}
4084

4085
static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4086
                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4087
{
4088
    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4089
        tcg_gen_not_vec(vece, v0, v0);
4090
    }
4091
}
4092

4093
static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4094
                              TCGv_vec c1, TCGv_vec c2,
4095
                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4096
{
4097
    TCGv_vec t = tcg_temp_new_vec(type);
4098

4099
    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4100
        /* Invert the sense of the compare by swapping arguments.  */
4101
        TCGv_vec x;
4102
        x = v3, v3 = v4, v4 = x;
4103
    }
4104
    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4105
              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4106
              tcgv_vec_arg(v3), tcgv_vec_arg(t));
4107
    tcg_temp_free_vec(t);
4108
}
4109

4110
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4111
                       TCGArg a0, ...)
4112
{
4113
    va_list va;
4114
    TCGArg a2;
4115
    TCGv_vec v0, v1, v2, v3, v4;
4116

4117
    va_start(va, a0);
4118
    v0 = temp_tcgv_vec(arg_temp(a0));
4119
    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4120
    a2 = va_arg(va, TCGArg);
4121

4122
    switch (opc) {
4123
    case INDEX_op_shli_vec:
4124
        expand_vec_shi(type, vece, false, v0, v1, a2);
4125
        break;
4126
    case INDEX_op_shri_vec:
4127
        expand_vec_shi(type, vece, true, v0, v1, a2);
4128
        break;
4129
    case INDEX_op_sari_vec:
4130
        expand_vec_sari(type, vece, v0, v1, a2);
4131
        break;
4132

4133
    case INDEX_op_rotli_vec:
4134
        expand_vec_rotli(type, vece, v0, v1, a2);
4135
        break;
4136

4137
    case INDEX_op_rotls_vec:
4138
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4139
        break;
4140

4141
    case INDEX_op_rotlv_vec:
4142
        v2 = temp_tcgv_vec(arg_temp(a2));
4143
        expand_vec_rotv(type, vece, v0, v1, v2, false);
4144
        break;
4145
    case INDEX_op_rotrv_vec:
4146
        v2 = temp_tcgv_vec(arg_temp(a2));
4147
        expand_vec_rotv(type, vece, v0, v1, v2, true);
4148
        break;
4149

4150
    case INDEX_op_mul_vec:
4151
        v2 = temp_tcgv_vec(arg_temp(a2));
4152
        expand_vec_mul(type, vece, v0, v1, v2);
4153
        break;
4154

4155
    case INDEX_op_cmp_vec:
4156
        v2 = temp_tcgv_vec(arg_temp(a2));
4157
        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4158
        break;
4159

4160
    case INDEX_op_cmpsel_vec:
4161
        v2 = temp_tcgv_vec(arg_temp(a2));
4162
        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4163
        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4164
        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4165
        break;
4166

4167
    default:
4168
        break;
4169
    }
4170

4171
    va_end(va);
4172
}
4173

4174
static const int tcg_target_callee_save_regs[] = {
4175
#if TCG_TARGET_REG_BITS == 64
4176
    TCG_REG_RBP,
4177
    TCG_REG_RBX,
4178
#if defined(_WIN64)
4179
    TCG_REG_RDI,
4180
    TCG_REG_RSI,
4181
#endif
4182
    TCG_REG_R12,
4183
    TCG_REG_R13,
4184
    TCG_REG_R14, /* Currently used for the global env. */
4185
    TCG_REG_R15,
4186
#else
4187
    TCG_REG_EBP, /* Currently used for the global env. */
4188
    TCG_REG_EBX,
4189
    TCG_REG_ESI,
4190
    TCG_REG_EDI,
4191
#endif
4192
};
4193

4194
/* Compute frame size via macros, to share between tcg_target_qemu_prologue
4195
   and tcg_register_jit.  */
4196

4197
#define PUSH_SIZE \
4198
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4199
     * (TCG_TARGET_REG_BITS / 8))
4200

4201
#define FRAME_SIZE \
4202
    ((PUSH_SIZE \
4203
      + TCG_STATIC_CALL_ARGS_SIZE \
4204
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4205
      + TCG_TARGET_STACK_ALIGN - 1) \
4206
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4207

4208
/* Generate global QEMU prologue and epilogue code */
4209
static void tcg_target_qemu_prologue(TCGContext *s)
4210
{
4211
    int i, stack_addend;
4212

4213
    /* TB prologue */
4214

4215
    /* Reserve some stack space, also for TCG temps.  */
4216
    stack_addend = FRAME_SIZE - PUSH_SIZE;
4217
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4218
                  CPU_TEMP_BUF_NLONGS * sizeof(long));
4219

4220
    /* Save all callee saved registers.  */
4221
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4222
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
4223
    }
4224

4225
    if (!tcg_use_softmmu && guest_base) {
4226
        int seg = setup_guest_base_seg();
4227
        if (seg != 0) {
4228
            x86_guest_base.seg = seg;
4229
        } else if (guest_base == (int32_t)guest_base) {
4230
            x86_guest_base.ofs = guest_base;
4231
        } else {
4232
            assert(TCG_TARGET_REG_BITS == 64);
4233
            /* Choose R12 because, as a base, it requires a SIB byte. */
4234
            x86_guest_base.index = TCG_REG_R12;
4235
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4236
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4237
        }
4238
    }
4239

4240
    if (TCG_TARGET_REG_BITS == 32) {
4241
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4242
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4243
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4244
        /* jmp *tb.  */
4245
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4246
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4247
                             + stack_addend);
4248
    } else {
4249
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4250
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4251
        /* jmp *tb.  */
4252
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4253
    }
4254

4255
    /*
4256
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4257
     * and fall through to the rest of the epilogue.
4258
     */
4259
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4260
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4261

4262
    /* TB epilogue */
4263
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4264

4265
    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4266

4267
    if (have_avx2) {
4268
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4269
    }
4270
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4271
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4272
    }
4273
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
4274
}
4275

4276
static void tcg_out_tb_start(TCGContext *s)
4277
{
4278
    /* nothing to do */
4279
}
4280

4281
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4282
{
4283
    memset(p, 0x90, count);
4284
}
4285

4286
static void tcg_target_init(TCGContext *s)
4287
{
4288
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4289
    if (TCG_TARGET_REG_BITS == 64) {
4290
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4291
    }
4292
    if (have_avx1) {
4293
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4294
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4295
    }
4296
    if (have_avx2) {
4297
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4298
    }
4299

4300
    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4301
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4302
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4303
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4304
    if (TCG_TARGET_REG_BITS == 64) {
4305
#if !defined(_WIN64)
4306
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4307
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4308
#endif
4309
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4310
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4311
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4312
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4313
    }
4314

4315
    s->reserved_regs = 0;
4316
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4317
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4318
#ifdef _WIN64
4319
    /* These are call saved, and we don't save them, so don't use them. */
4320
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4321
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4322
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4323
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4324
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4325
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4326
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4327
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4328
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4329
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4330
#endif
4331
}
4332

4333
typedef struct {
4334
    DebugFrameHeader h;
4335
    uint8_t fde_def_cfa[4];
4336
    uint8_t fde_reg_ofs[14];
4337
} DebugFrame;
4338

4339
/* We're expecting a 2 byte uleb128 encoded value.  */
4340
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4341

4342
#if !defined(__ELF__)
4343
    /* Host machine without ELF. */
4344
#elif TCG_TARGET_REG_BITS == 64
4345
#define ELF_HOST_MACHINE EM_X86_64
4346
static const DebugFrame debug_frame = {
4347
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4348
    .h.cie.id = -1,
4349
    .h.cie.version = 1,
4350
    .h.cie.code_align = 1,
4351
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
4352
    .h.cie.return_column = 16,
4353

4354
    /* Total FDE size does not include the "len" member.  */
4355
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4356

4357
    .fde_def_cfa = {
4358
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4359
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4360
        (FRAME_SIZE >> 7)
4361
    },
4362
    .fde_reg_ofs = {
4363
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4364
        /* The following ordering must match tcg_target_callee_save_regs.  */
4365
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4366
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4367
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4368
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4369
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4370
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4371
    }
4372
};
4373
#else
4374
#define ELF_HOST_MACHINE EM_386
4375
static const DebugFrame debug_frame = {
4376
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4377
    .h.cie.id = -1,
4378
    .h.cie.version = 1,
4379
    .h.cie.code_align = 1,
4380
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4381
    .h.cie.return_column = 8,
4382

4383
    /* Total FDE size does not include the "len" member.  */
4384
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4385

4386
    .fde_def_cfa = {
4387
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4388
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4389
        (FRAME_SIZE >> 7)
4390
    },
4391
    .fde_reg_ofs = {
4392
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4393
        /* The following ordering must match tcg_target_callee_save_regs.  */
4394
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4395
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4396
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4397
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4398
    }
4399
};
4400
#endif
4401

4402
#if defined(ELF_HOST_MACHINE)
4403
void tcg_register_jit(const void *buf, size_t buf_size)
4404
{
4405
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4406
}
4407
#endif
4408

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.