/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"
#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */

#if TCG_TARGET_REG_BITS == 64
#define TCG_TMP_VEC  TCG_REG_XMM5
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
{
    switch (kind) {
    case TCG_CALL_RET_NORMAL:
        tcg_debug_assert(slot >= 0 && slot <= 1);
        return slot ? TCG_REG_EDX : TCG_REG_EAX;
#ifdef _WIN64
    case TCG_CALL_RET_BY_VEC:
        tcg_debug_assert(slot == 0);
        return TCG_REG_XMM0;
#endif
    default:
        g_assert_not_reached();
    }
}
/* Constants we accept. */
#define TCG_CT_CONST_S32    0x100
#define TCG_CT_CONST_U32    0x200
#define TCG_CT_CONST_I32    0x400
#define TCG_CT_CONST_WSZ    0x800
#define TCG_CT_CONST_TST    0x1000
/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        0x0000000fu
#endif
#define SOFTMMU_RESERVE_REGS \
    (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)

#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
static const tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch (type) {
    case R_386_PC32:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        g_assert_not_reached();
    }
    return true;
}
/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, int ct,
                                   TCGType type, TCGCond cond, int vece)
{
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if (type == TCG_TYPE_I32) {
        if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
                  TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
            return 1;
        }
    } else {
        if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
            return 1;
        }
        if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
            return 1;
        }
        /*
         * This will be used in combination with TCG_CT_CONST_S32,
         * so "normal" TESTQ is already matched.  Also accept:
         *    TESTQ -> TESTL   (uint32_t)
         *    TESTQ -> BT      (is_power_of_2)
         */
        if ((ct & TCG_CT_CONST_TST)
            && is_tst_cond(cond)
            && (val == (uint32_t)val || is_power_of_2(val))) {
            return 1;
        }
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}
# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
#define OPC_ARITH_EbIb  (0x80)
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTB       (0x84)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv   (0x87)

#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
#define OPC_GRPBT       (0xba | P_EXT)

#define OPC_GRPBT_BT    4
#define OPC_GRPBT_BTS   5
#define OPC_GRPBT_BTR   6
#define OPC_GRPBT_BTC   7
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH. */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3. */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5. */
#define EXT5_INC_Ev    0
#define EXT5_DEC_Ev    1
#define EXT5_CALLN_Ev  2
#define EXT5_JMPN_Ev   4

/* Condition codes to be added to OPC_JCC_{long,short}. */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
    [TCG_COND_TSTEQ] = JCC_JE,
    [TCG_COND_TSTNE] = JCC_JNE,
};
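/*
 * The mapping above relies on the flags as produced by CMP/TEST:
 * e.g. TCG_COND_LTU uses JB (CF set) and TCG_COND_LEU uses JBE
 * (CF or ZF set), while TSTEQ/TSTNE reuse JE/JNE against the ZF
 * result of TEST.
 */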
#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation. */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /*
     * P_REXB_{R,RM} indicates that the given register is the low byte.
     * For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
     * as otherwise the encoding indicates %[abcd]h.  Note that the values
     * that are ORed in merely indicate that the REX byte must be present;
     * those bits get discarded in output.
     */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }
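    /*
     * Worked example (illustrative): the bytes 4c 8b c8 decode as
     * "movq %rax, %r9" -- OPC_MOVL_GvEv with P_REXW, r = %r9, rm = %rax
     * yields rex = 0x40 | 0x08 (REX.W) | 0x04 (REX.R) = 0x4c above.
     */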
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/*
 * Discard the register arguments to tcg_out_opc early, so as not to penalize
 * the 32-bit compilation paths.  This method works with all versions of gcc,
 * whereas relying on optimization may not be able to exclude them.
 */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix. */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix. */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }

        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
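    /*
     * Reference sketch for the two-byte form assembled above (cf. the
     * VEX prefix description in the Intel SDM): the single payload byte
     * is laid out as ~R in bit 7, ~vvvv in bits 6:3, L in bit 2 and pp
     * in bits 1:0, which is why the register numbers are inverted
     * before being merged in.
     */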
    tcg_out8(s, tmp);
}

static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
                             int rm, int index)
{
    /* The entire 4-byte evex prefix; with R' and V' set. */
    uint32_t p = 0x08041062;
    int mm, pp;

    tcg_debug_assert(have_avx512vl);

    /* EVEX.mm */
    if (opc & P_EXT3A) {
        mm = 3;
    } else if (opc & P_EXT38) {
        mm = 2;
    } else if (opc & P_EXT) {
        mm = 1;
    } else {
        g_assert_not_reached();
    }

    /* EVEX.pp */
    if (opc & P_DATA16) {
        pp = 1;                                /* 0x66 */
    } else if (opc & P_SIMDF3) {
        pp = 2;                                /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        pp = 3;                                /* 0xf2 */
    } else {
        pp = 0;
    }

    p = deposit32(p, 8, 2, mm);
    p = deposit32(p, 13, 1, (rm & 8) == 0);    /* EVEX.RXB.B */
    p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
    p = deposit32(p, 15, 1, (r & 8) == 0);     /* EVEX.RXB.R */
    p = deposit32(p, 16, 2, pp);
    p = deposit32(p, 19, 4, ~v);
    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);

    tcg_out32(s, p);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    if (opc & P_EVEX) {
        tcg_out_evex_opc(s, opc, r, v, rm, 0);
    } else {
        tcg_out_vex_opc(s, opc, r, v, rm, 0);
    }
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   A missing RM or INDEX is indicated with a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction. */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding. */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing. */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable. */
            g_assert_not_reached();
        } else {
            /* Absolute address. */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing. */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form. */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format. */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format. */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index. */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    /* Immediate addend. */
    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}
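/*
 * Worked example (illustrative): r = %eax, rm = %ebx, index = %ecx,
 * shift = 2, offset = 0x10 produces modrm 0x44 (mod=01, rm=100 escapes
 * to SIB), sib 0x8b (scale 4, index %ecx, base %ebx) and disp8 0x10,
 * i.e. "lea 0x10(%ebx,%ecx,4), %eax" when paired with OPC_LEA.
 */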
static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift. */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}
/* Output an opcode with an expected reference to the constant pool. */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit. */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
}

/* Output an opcode with an expected reference to the constant pool. */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit. */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi. */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW. */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB. */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0. */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}
static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg ret, int64_t arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
        if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        if (TCG_TARGET_REG_BITS == 64) {
            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
        } else {
            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
        }
    }
}
static void tcg_out_movi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
        return;
    }

    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
    if (TCG_TARGET_REG_BITS == 64) {
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}
static void tcg_out_movi_int(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq. */
    diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}
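/*
 * Encoding sizes behind the choices above (illustrative): xor is 2-3
 * bytes, movl $imm32 is 5-6, the sign-extended movq via C7 and the
 * rip-relative lea are 7 each, and the full movabsq immediate is 10.
 */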
static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            tcg_out_movi_int(s, type, ret, arg);
        } else {
            tcg_out_movi_vec(s, type, ret, arg);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
    tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
    return true;
}

static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
                             tcg_target_long imm)
{
    /* This function is only used for passing structs by reference. */
    tcg_debug_assert(imm == (int32_t)imm);
    tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
}
static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        g_assert_not_reached();
    }
}
static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we only need care for
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn. */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);   /* lock */
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}
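/*
 * The locked or above is a no-op store, but as an atomic
 * read-modify-write it still orders earlier stores before later
 * loads -- the one ordering x86 does not already guarantee.
 */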
static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}
static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment. */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         *
         * This specific instance is also used by TCG_CALL_RET_BY_VEC,
         * for _WIN64, which must have SSE2 but may not have AVX.
         */
        tcg_debug_assert(arg >= 16);
        if (have_avx1) {
            tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
        }
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}
static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16. */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}
static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}
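/* Rotating a 16-bit quantity left by 8 swaps its two bytes, i.e. bswap16. */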
static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
{
    /* 32-bit mov zero extends. */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32s(s, dest, src);
}

static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
{
    tcg_out_ext32u(s, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}
static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    switch (c) {
    case ARITH_ADD:
    case ARITH_SUB:
        if (!cf) {
            /*
             * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
             * partial flags update stalls on Pentium4 and are not recommended
             * by current Intel optimization manuals.
             */
            if (val == 1 || val == -1) {
                int is_inc = (c == ARITH_ADD) ^ (val < 0);
                if (TCG_TARGET_REG_BITS == 64) {
                    /*
                     * The single-byte increment encodings are re-tasked
                     * as the REX prefixes.  Use the MODRM encoding.
                     */
                    tcg_out_modrm(s, OPC_GRP5 + rexw,
                                  (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
                } else {
                    tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
                }
                return;
            }
            if (val == 128) {
                /*
                 * Facilitate using an 8-bit immediate.  Carry is inverted
                 * by this transformation, so do it only if cf == 0.
                 */
                c ^= ARITH_ADD ^ ARITH_SUB;
                val = -128;
            }
        }
        break;

    case ARITH_AND:
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation. */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
        break;

    case ARITH_OR:
    case ARITH_XOR:
        if (val >= 0x80 && val <= 0xff
            && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
            tcg_out8(s, val);
            return;
        }
        break;
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    g_assert_not_reached();
}
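/*
 * E.g. "add $128" has no 8-bit immediate form (128 > INT8_MAX), but
 * the ADD/SUB transformation above turns it into "sub $-128", which
 * does fit in a signed byte.
 */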
static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}
/* Set SMALL to force a short forward branch. */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            tcg_debug_assert(!small);
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}
static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
                       TCGArg arg2, int const_arg2, int rexw)
{
    int jz, js;

    if (!is_tst_cond(cond)) {
        if (!const_arg2) {
            tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
        } else if (arg2 == 0) {
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
        return tcg_cond_to_jcc[cond];
    }

    jz = tcg_cond_to_jcc[cond];
    js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);

    if (!const_arg2) {
        tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
        return jz;
    }

    if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
        if (arg2 == 0x80) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return js;
        }
        if (arg2 == 0xff) {
            tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
        tcg_out8(s, arg2);
        return jz;
    }

    if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
        if (arg2 == 0x8000) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return js;
        }
        if (arg2 == 0xff00) {
            tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
            return jz;
        }
        tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
        tcg_out8(s, arg2 >> 8);
        return jz;
    }
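    /*
     * Note the "arg1 + 4" trick above: without a REX prefix, ModRM
     * register numbers 4-7 name the high-byte registers %ah/%ch/%dh/%bh,
     * so adding 4 to %eax..%ebx tests bits 8-15 directly.
     */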
    if (arg2 == 0xffff) {
        tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
        return jz;
    }
    if (arg2 == 0xffffffffu) {
        tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
        return jz;
    }

    if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
        int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
        int sh = ctz64(arg2);

        rexw = (sh & 32 ? P_REXW : 0);
        if ((sh & 31) == 31) {
            tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
            return js;
        } else {
            tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
            tcg_out8(s, sh);
            return jc;
        }
    }

    if (rexw) {
        if (arg2 == (uint32_t)arg2) {
            rexw = 0;
        } else {
            tcg_debug_assert(arg2 == (int32_t)arg2);
        }
    }
    tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
    tcg_out32(s, arg2);
    return jz;
}
static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
                           TCGArg arg1, TCGArg arg2, int const_arg2,
                           TCGLabel *label, bool small)
{
    int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
    tcg_out_jxx(s, jcc, label, small);
}
#if TCG_TARGET_REG_BITS == 32
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    TCGCond cond = args[4];

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_TSTEQ:
        tcg_out_brcond(s, 0, tcg_invert_cond(cond),
                       args[0], args[2], const_args[2], label_next, 1);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;
    case TCG_COND_NE:
    case TCG_COND_TSTNE:
        tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
                       label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
                       label_this, small);
        break;
    default:
        g_assert_not_reached();
    }
    tcg_out_label(s, label_next);
}
#endif
static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGArg dest, TCGArg arg1, TCGArg arg2,
                            int const_arg2, bool neg)
{
    int cmp_rexw = rexw;
    bool inv = false;
    bool cleared;
    int jcc;

    switch (cond) {
    case TCG_COND_NE:
        inv = true;
        /* fall through */
    case TCG_COND_EQ:
        /* If arg2 is 0, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0) {
            arg2 = 1;
            goto do_ltu;
        }
        break;

    case TCG_COND_TSTNE:
        inv = true;
        /* fall through */
    case TCG_COND_TSTEQ:
        /* If arg2 is -1, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0xffffffffu) {
            arg2 = 1;
            cmp_rexw = 0;
            goto do_ltu;
        }
        break;

    case TCG_COND_LEU:
        inv = true;
        /* fall through */
    case TCG_COND_GTU:
        /* If arg2 is a register, swap for LTU/GEU. */
        if (!const_arg2) {
            TCGReg t = arg1;
            arg1 = arg2;
            arg2 = t;
            goto do_ltu;
        }
        break;

    case TCG_COND_GEU:
        inv = true;
        /* fall through */
    case TCG_COND_LTU:
    do_ltu:
        /*
         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
         * We can then use NEG or INC to produce the desired result.
         * This is always smaller than the SETCC expansion.
         */
        tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);

        /* X - X - C = -C = (C ? -1 : 0) */
        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
        if (inv && neg) {
            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
        } else if (inv) {
            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
        } else if (!neg) {
            /* -(C ? -1 : 0) = (C ? 1 : 0) */
            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
        }
        return;
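        /*
         * Worked example of the SBB trick: after CMP sets C for LTU,
         * "sbb %dest, %dest" computes dest - dest - C, i.e. -1 when the
         * condition holds and 0 when it does not; NOT, ADD $1 and NEG
         * then produce the inverted and non-negated variants noted in
         * the comments above.
         */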
    case TCG_COND_GE:
        inv = true;
        /* fall through */
    case TCG_COND_LT:
        /* If arg2 is 0, extract the sign bit. */
        if (const_arg2 && arg2 == 0) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
            if (inv) {
                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
            }
            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
                           dest, rexw ? 63 : 31);
            return;
        }
        break;

    default:
        break;
    }

    /*
     * If dest does not overlap the inputs, clearing it first is preferred.
     * The XOR breaks any false dependency for the low-byte write to dest,
     * and is also one byte smaller than MOVZBL.
     */
    cleared = false;
    if (dest != arg1 && (const_arg2 || dest != arg2)) {
        tgen_arithr(s, ARITH_XOR, dest, dest);
        cleared = true;
    }

    jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
    tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);

    if (!cleared) {
        tcg_out_ext8u(s, dest, dest);
    }
    if (neg) {
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
    }
}
#if TCG_TARGET_REG_BITS == 32
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky. */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code. */
        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
    }
}
#endif
static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
                         TCGReg dest, TCGReg v1)
{
    tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
}

static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
                            TCGReg v1)
{
    int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
    tcg_out_cmov(s, jcc, rexw, dest, v1);
}
static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
    }
}
static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index, not the count. */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test. */
        int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
        tcg_out_cmov(s, jcc, rexw, dest, arg2);
    }
}
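/* E.g. arg1 = 0x8: BSR yields index 3, and 3 ^ 31 = 28 = clz32(0x8). */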
static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls. */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
                         const TCGHelperInfo *info)
{
    tcg_out_branch(s, 1, dest);

    if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
        /*
         * The sysv i386 abi for struct return places a reference as the
         * first argument on the stack, and pops that argument with the
         * return statement.  Since we want to retain the aligned stack
         * pointer for the callee, we do not want to actually push that
         * argument before the call but rely on the normal store to the
         * stack slot.  But we do need to compensate for the pop in order
         * to reset our correct stack pointer value.
         * Pushing a garbage value back onto the stack is quickest.
         */
        tcg_out_push(s, TCG_REG_EAX);
    }
}

static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}
static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
     * "xchg %eax,%eax", forming "xchg %ax,%ax".  All cores accept the
     * duplicate prefix, and all of the interesting recent cores can
     * decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}
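/* E.g. n = 3 emits the byte sequence 66 66 90. */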
bool tcg_target_has_memory_bswap(MemOp memop)
{
    TCGAtomAlign aa;

    if (!have_movbe) {
        return false;
    }
    if ((memop & MO_SIZE) < MO_128) {
        return true;
    }

    /*
     * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
     * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
     */
    aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
    return aa.atom < MO_128;
}
/*
 * Because i686 has no register parameters and because x86_64 has xchg
 * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
 *
 * Even then, a scratch is only needed for l->raddr.  Rather than expose
 * a general-purpose scratch when we don't actually know it's available,
 * use the ra_gen hook to load into RAX if needed.
 */
#if TCG_TARGET_REG_BITS == 64
static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
{
    if (arg < 0) {
        arg = TCG_REG_RAX;
    }
    tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
    return arg;
}
static const TCGLdstHelperParam ldst_helper_param = {
    .ra_gen = ldst_ra_gen
};
#else
static const TCGLdstHelperParam ldst_helper_param = { };
#endif
static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
                                TCGReg l, TCGReg h, TCGReg v)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    /* vpmov{d,q} %v, %l */
    tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
    /* vpextr{d,q} $1, %v, %h */
    tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
    tcg_out8(s, 1);
}

static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
                                TCGReg v, TCGReg l, TCGReg h)
{
    int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

    /* vmov{d,q} %l, %v */
    tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
    /* vpinsr{d,q} $1, %h, %v, %v */
    tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
    tcg_out8(s, 1);
}
/*
 * Generate code for the slow path for a load at the end of block
 */
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    MemOp opc = get_memop(l->oi);
    tcg_insn_unit **label_ptr = &l->label_ptr[0];

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (label_ptr[1]) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    tcg_out_ld_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
    tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);

    tcg_out_jmp(s, l->raddr);
    return true;
}
/*
 * Generate code for the slow path for a store at the end of block
 */
static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    MemOp opc = get_memop(l->oi);
    tcg_insn_unit **label_ptr = &l->label_ptr[0];

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (label_ptr[1]) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    tcg_out_st_helper_args(s, l, &ldst_helper_param);
    tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);

    tcg_out_jmp(s, l->raddr);
    return true;
}
#ifdef CONFIG_USER_ONLY
static HostAddress x86_guest_base = {
    .index = -1
};

#if defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
static inline int setup_guest_base_seg(void)
{
    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#elif defined(__x86_64__) && \
      (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
# include <machine/sysarch.h>
static inline int setup_guest_base_seg(void)
{
    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
        return P_GS;
    }
    return 0;
}
#define setup_guest_base_seg  setup_guest_base_seg
#endif
#else
# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
#endif /* CONFIG_USER_ONLY */
#ifndef setup_guest_base_seg
# define setup_guest_base_seg() 0
#endif
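/*
 * When setup_guest_base_seg() succeeds, guest_base is installed in the
 * %gs segment base, and user-mode guest accesses are expected to carry
 * the P_GS override returned above instead of an explicit base-register
 * addition.
 */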
#define MIN_TLB_MASK_TABLE_OFS  INT_MIN

/*
 * For softmmu, perform the TLB load and compare.
 * For useronly, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addrlo, TCGReg addrhi,
                                           MemOpIdx oi, bool is_ld)
{
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    MemOp s_bits = opc & MO_SIZE;
    unsigned a_mask;

    if (tcg_use_softmmu) {
        h->index = TCG_REG_L0;
        h->ofs = 0;
        h->seg = 0;
    } else {
        *h = x86_guest_base;
        h->base = addrlo;
    }
    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
    a_mask = (1 << h->aa.align) - 1;

    if (tcg_use_softmmu) {
        int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
                            : offsetof(CPUTLBEntry, addr_write);
        TCGType ttype = TCG_TYPE_I32;
        TCGType tlbtype = TCG_TYPE_I32;
        int trexw = 0, hrexw = 0, tlbrexw = 0;
        unsigned mem_index = get_mmuidx(oi);
        unsigned s_mask = (1 << s_bits) - 1;
        int fast_ofs = tlb_mask_table_ofs(s, mem_index);
        tcg_target_long tlb_mask;

        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        if (TCG_TARGET_REG_BITS == 64) {
            ttype = s->addr_type;
            trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
            if (TCG_TYPE_PTR == TCG_TYPE_I64) {
                hrexw = P_REXW;
                if (s->page_bits + s->tlb_dyn_max_bits > 32) {
                    tlbtype = TCG_TYPE_I64;
                    tlbrexw = P_REXW;
                }
            }
        }

        tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
        tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
                       s->page_bits - CPU_TLB_ENTRY_BITS);

        tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, mask));

        tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
                             fast_ofs + offsetof(CPUTLBDescFast, table));

        /*
         * If the required alignment is at least as large as the access,
         * simply copy the address and mask.  For lesser alignments,
         * check that we don't cross pages for the complete access.
         */
        if (a_mask >= s_mask) {
            tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
        } else {
            tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
                                 addrlo, s_mask - a_mask);
        }
        tlb_mask = s->page_mask | a_mask;
        tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);

        /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
                             TCG_REG_L1, TCG_REG_L0, cmp_ofs);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;

        if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
            /* cmp 4(TCG_REG_L0), addrhi */
            tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
                                 TCG_REG_L0, cmp_ofs + 4);

            /* jne slow_path */
            tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
            ldst->label_ptr[1] = s->code_ptr;
            s->code_ptr += 4;
        }

        /* TLB Hit.  */
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
                   offsetof(CPUTLBEntry, addend));
    } else if (a_mask) {
        int jcc;

        ldst = new_ldst_label(s);
        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        /* jne slow_path */
        jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false);
        tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;
    }

    return ldst;
}
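/*
 * Schematically, the softmmu fast path assembled above is
 * (64-bit shown; %areg is the env register TCG_AREG0):
 *     mov  %addr, %L0
 *     shr  $(page_bits - CPU_TLB_ENTRY_BITS), %L0
 *     and  mask(%areg), %L0
 *     add  table(%areg), %L0
 *     lea  s_mask-a_mask(%addr), %L1    (plain mov when a_mask >= s_mask)
 *     and  $(page_mask | a_mask), %L1
 *     cmp  addr_read/addr_write(%L0), %L1
 *     jne  slow_path
 *     mov  addend(%L0), %L0
 */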
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, TCGType type, MemOp memop)
{
    bool use_movbe = false;
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    int movop = OPC_MOVL_GvEv;

    /* Do big-endian loads with movbe. */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_UW:
        if (use_movbe) {
            /* There is no extending movbe; only the low 16 bits are
               modified. */
            if (datalo != h.base && datalo != h.index) {
                /* XOR breaks dependency chains. */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
            } else {
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
            tcg_out_ext16s(s, type, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
#endif
    case MO_UQ:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            break;
        }
        if (use_movbe) {
            TCGReg t = datalo;
            datalo = datahi;
            datahi = t;
        }
        if (h.base == datalo || h.index == datalo) {
            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
        } else {
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;
2298
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2301
* Without 16-byte atomicity, use integer regs.
2302
* That is where we want the data, and it allows bswaps.
2304
if (h.aa.atom < MO_128) {
2310
if (h.base == datalo || h.index == datalo) {
2311
tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2312
h.base, h.index, 0, h.ofs);
2313
tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2315
tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2318
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2319
h.base, h.index, 0, h.ofs);
2320
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2321
h.base, h.index, 0, h.ofs + 8);
2327
* With 16-byte atomicity, a vector load is required.
2328
* If we already have 16-byte alignment, then VMOVDQA always works.
2329
* Else if VMOVDQU has atomicity with dynamic alignment, use that.
2330
* Else use we require a runtime test for alignment for VMOVDQA;
2331
* use VMOVDQU on the unaligned nonatomic path for simplicity.
2333
if (h.aa.align >= MO_128) {
2334
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2336
h.base, h.index, 0, h.ofs);
2337
} else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2338
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2340
h.base, h.index, 0, h.ofs);
2342
TCGLabel *l1 = gen_new_label();
2343
TCGLabel *l2 = gen_new_label();
2346
jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2347
tcg_out_jxx(s, jcc, l1, true);
2349
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2351
h.base, h.index, 0, h.ofs);
2352
tcg_out_jxx(s, JCC_JMP, l2, true);
2354
tcg_out_label(s, l1);
2355
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2357
h.base, h.index, 0, h.ofs);
2358
tcg_out_label(s, l2);
2360
tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2364
g_assert_not_reached();
2368
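
/*
 * An illustrative sketch of the atomic MO_128 load above, assuming a
 * 16-byte-aligned access (register names are examples only):
 *     vmovdqa (%base,%index), %xmm5    ; one atomic 16-byte load
 *     ... scalar moves of the two 8-byte halves into datalo/datahi ...
 * The exact scalar moves are chosen by tcg_out_vec_to_pair() according
 * to the host's SSE feature set.
 */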
static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addrlo, TCGReg addrhi,
                            MemOpIdx oi, TCGType data_type)
{
    TCGLabelQemuLdst *ldst;
    HostAddress h;

    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
    tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));

    if (ldst) {
        ldst->type = data_type;
        ldst->datalo_reg = datalo;
        ldst->datahi_reg = datahi;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
    }
}
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or system-mode.
     * User-only without movbe will have its swapping done generically.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
                                 datalo, h.base, h.index, 0, h.ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        } else {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;
    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we have the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 8);
            break;
        }

        /*
         * With 16-byte atomicity, a vector store is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();
            int jcc;

            jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
            tcg_out_jxx(s, jcc, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        break;

    default:
        g_assert_not_reached();
    }
}
static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
                            TCGReg addrlo, TCGReg addrhi,
                            MemOpIdx oi, TCGType data_type)
{
    TCGLabelQemuLdst *ldst;
    HostAddress h;

    ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
    tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));

    if (ldst) {
        ldst->type = data_type;
        ldst->datalo_reg = datalo;
        ldst->datahi_reg = datahi;
        ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
    }
}
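
/*
 * exit_tb returns to the main loop with EAX holding the reason for exit.
 * A zero value reuses the EAX-clearing code already emitted for the
 * goto_ptr return path; any other value is materialized with a movi
 * before jumping to the epilogue at tb_ret_addr.
 */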
static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
{
    /* Reuse the zeroing that exists for goto_ptr.  */
    if (a0 == 0) {
        tcg_out_jmp(s, tcg_code_gen_epilogue);
    } else {
        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
        tcg_out_jmp(s, tb_ret_addr);
    }
}
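
/*
 * Example of the alignment dance below: if s->code_ptr ends in ...0x26,
 * the jmp opcode byte would place its rel32 operand at ...0x27; aligning
 * code_ptr + 1 up to 4 yields a gap of 2, so one nop is emitted and the
 * operand lands at ...0x28, a 4-byte-aligned address that can be patched
 * with a single atomic 32-bit store.
 */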
static void tcg_out_goto_tb(TCGContext *s, int which)
{
    /*
     * Jump displacement must be aligned for atomic patching;
     * see if we need to add extra nops before the jump.
     */
    int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
    if (gap != 1) {
        tcg_out_nopn(s, gap - 1);
    }
    tcg_out8(s, OPC_JMP_long); /* jmp im */
    set_jmp_insn_offset(s, which);
    tcg_out32(s, 0);
    set_jmp_reset_offset(s, which);
}
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
                              uintptr_t jmp_rx, uintptr_t jmp_rw)
{
    /* patch the branch destination */
    uintptr_t addr = tb->jmp_target_addr[n];
    qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
    /* no need to flush icache explicitly */
}
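
/*
 * The displacement written above is relative to the end of the 4-byte
 * operand: for a jmp whose operand lives at jmp_rx, the target is reached
 * as jmp_rx + 4 + disp32, hence disp32 = addr - (jmp_rx + 4).
 */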
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                              const TCGArg args[TCG_MAX_OP_ARGS],
                              const int const_args[TCG_MAX_OP_ARGS])
{
    TCGArg a0, a1, a2;
    int c, const_a2, vexop, rexw = 0;

#if TCG_TARGET_REG_BITS == 64
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i64): \
            rexw = P_REXW; /* FALLTHRU */ \
        case glue(glue(INDEX_op_, x), _i32)
#else
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i32)
#endif

    /* Hoist the loads of the most common arguments.  */
    a0 = args[0];
    a1 = args[1];
    a2 = args[2];
    const_a2 = const_args[2];

    switch (opc) {
    case INDEX_op_goto_ptr:
        /* jmp to the given host address (could be epilogue) */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
        break;
    case INDEX_op_br:
        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
        break;
    OP_32_64(ld8u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
        break;
    OP_32_64(ld8s):
        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
        break;
    OP_32_64(ld16u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
        break;
    OP_32_64(ld16s):
        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_ld32u_i64:
#endif
    case INDEX_op_ld_i32:
        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
        break;

    OP_32_64(st8):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
            tcg_out8(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
        }
        break;
    OP_32_64(st16):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
            tcg_out16(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_st32_i64:
#endif
    case INDEX_op_st_i32:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
        }
        break;

    OP_32_64(add):
        /* For 3-operand addition, use LEA.  */
        if (a0 != a1 || const_a2 || rexw == P_REXW) {
            TCGArg c3 = 0;
            if (const_a2) {
                c3 = a2, a2 = -1;
            } else if (a0 == a2) {
                /* Watch out for dest = src + dest, since we've removed
                   the matching constraint on the add.  */
                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
                break;
            }

            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
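            /*
             * E.g. add_i32 a0, a1, a2 with three distinct registers
             * becomes "lea (%a1,%a2), %a0", and with a constant a2 it
             * becomes "lea c3(%a1), %a0" -- one instruction, no flags
             * clobbered, and no separate mov required.
             */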
            break;
        }
        c = ARITH_ADD;
        goto gen_arith;
    OP_32_64(sub):
        c = ARITH_SUB;
        goto gen_arith;
    OP_32_64(and):
        c = ARITH_AND;
        goto gen_arith;
    OP_32_64(or):
        c = ARITH_OR;
        goto gen_arith;
    OP_32_64(xor):
        c = ARITH_XOR;
        goto gen_arith;
    gen_arith:
        if (const_a2) {
            tgen_arithi(s, c + rexw, a0, a2, 0);
        } else {
            tgen_arithr(s, c + rexw, a0, a2);
        }
        break;

    OP_32_64(andc):
        if (const_a2) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
        } else {
            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
        }
        break;

    OP_32_64(mul):
        if (const_a2) {
            int32_t val;
            val = a2;
            if (val == (int8_t)val) {
                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
                tcg_out8(s, val);
            } else {
                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
                tcg_out32(s, val);
            }
        } else {
            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
        }
        break;

    OP_32_64(div2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
        break;
    OP_32_64(divu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
        break;

    OP_32_64(shl):
        /* For small constant 3-operand shift, use LEA.  */
        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
            if (a2 - 1 == 0) {
                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
            } else {
                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
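                /*
                 * E.g. shl_i32 a0, a1, $3 becomes "lea 0(,%a1,8), %a0";
                 * SIB scales 2, 4 and 8 cover shift counts 1, 2 and 3.
                 */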
            }
            break;
        }
        c = SHIFT_SHL;
        vexop = OPC_SHLX;
        goto gen_shift_maybe_vex;
    OP_32_64(shr):
        c = SHIFT_SHR;
        vexop = OPC_SHRX;
        goto gen_shift_maybe_vex;
    OP_32_64(sar):
        c = SHIFT_SAR;
        vexop = OPC_SARX;
        goto gen_shift_maybe_vex;
    OP_32_64(rotl):
        c = SHIFT_ROL;
        goto gen_shift;
    OP_32_64(rotr):
        c = SHIFT_ROR;
        goto gen_shift;
    gen_shift_maybe_vex:
        if (have_bmi2) {
            if (!const_a2) {
                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
                break;
            }
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
        }
        /* FALLTHRU */
    gen_shift:
        if (const_a2) {
            tcg_out_shifti(s, c + rexw, a0, a2);
        } else {
            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
        }
        break;

    OP_32_64(ctz):
        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(clz):
        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(ctpop):
        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
        break;

    OP_32_64(brcond):
        tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
                       arg_label(args[3]), 0);
        break;
    OP_32_64(setcond):
        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
        break;
    OP_32_64(negsetcond):
        tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
        break;
    OP_32_64(movcond):
        tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    OP_32_64(bswap16):
        if (a2 & TCG_BSWAP_OS) {
            /* Output must be sign-extended.  */
            if (rexw) {
                tcg_out_bswap64(s, a0);
                tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
            } else {
                tcg_out_bswap32(s, a0);
                tcg_out_shifti(s, SHIFT_SAR, a0, 16);
            }
        } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
            /* Output must be zero-extended, but input isn't.  */
            tcg_out_bswap32(s, a0);
            tcg_out_shifti(s, SHIFT_SHR, a0, 16);
        } else {
            tcg_out_rolw_8(s, a0);
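            /*
             * rolw $8 swaps the two low bytes in place; bits 16 and up
             * are unchanged, which is acceptable here because neither
             * sign- nor zero-extension of the output was requested.
             */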
        }
        break;
    OP_32_64(bswap32):
        tcg_out_bswap32(s, a0);
        if (rexw && (a2 & TCG_BSWAP_OS)) {
            tcg_out_ext32s(s, a0, a0);
        }
        break;

    OP_32_64(neg):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
        break;
    OP_32_64(not):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
        break;

    case INDEX_op_qemu_ld_a64_i32:
        if (TCG_TARGET_REG_BITS == 32) {
            tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
            break;
        }
        /* fall through */
    case INDEX_op_qemu_ld_a32_i32:
        tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
        break;
    case INDEX_op_qemu_ld_a32_i64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
        } else {
            tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
        }
        break;
    case INDEX_op_qemu_ld_a64_i64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
        } else {
            tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
        }
        break;
    case INDEX_op_qemu_ld_a32_i128:
    case INDEX_op_qemu_ld_a64_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
        break;

    case INDEX_op_qemu_st_a64_i32:
    case INDEX_op_qemu_st8_a64_i32:
        if (TCG_TARGET_REG_BITS == 32) {
            tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
            break;
        }
        /* fall through */
    case INDEX_op_qemu_st_a32_i32:
    case INDEX_op_qemu_st8_a32_i32:
        tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
        break;
    case INDEX_op_qemu_st_a32_i64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
        } else {
            tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
        }
        break;
    case INDEX_op_qemu_st_a64_i64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
        } else {
            tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
        }
        break;
    case INDEX_op_qemu_st_a32_i128:
    case INDEX_op_qemu_st_a64_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
        break;

    OP_32_64(mulu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
        break;
    OP_32_64(muls2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
        break;
    OP_32_64(add2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
        }
        break;
    OP_32_64(sub2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
        }
        break;

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_brcond2_i32:
        tcg_out_brcond2(s, args, const_args, 0);
        break;
    case INDEX_op_setcond2_i32:
        tcg_out_setcond2(s, args, const_args);
        break;
#else /* TCG_TARGET_REG_BITS == 64 */
    case INDEX_op_ld32s_i64:
        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
        break;
    case INDEX_op_ld_i64:
        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
        break;
    case INDEX_op_st_i64:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
        }
        break;

    case INDEX_op_bswap64_i64:
        tcg_out_bswap64(s, a0);
        break;
    case INDEX_op_extrh_i64_i32:
        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
        break;
#endif

    OP_32_64(deposit):
        if (args[3] == 0 && args[4] == 8) {
            /* load bits 0..7 */
            if (const_a2) {
                tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
                            0, a0, 0);
                tcg_out8(s, a2);
            } else {
                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
            }
        } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
            /* load bits 8..15 */
            if (const_a2) {
                tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
                tcg_out8(s, a2);
            } else {
                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
            }
        } else if (args[3] == 0 && args[4] == 16) {
            /* load bits 0..15 */
            if (const_a2) {
                tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
                            0, a0, 0);
                tcg_out16(s, a2);
            } else {
                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
            }
        } else {
            g_assert_not_reached();
        }
        break;

    case INDEX_op_extract_i64:
        if (a2 + args[3] == 32) {
            /* This is a 32-bit zero-extending right shift.  */
            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
            break;
        }
        /* FALLTHRU */
    case INDEX_op_extract_i32:
        /* On the off-chance that we can use the high-byte registers.
           Otherwise we emit the same ext16 + shift pattern that we
           would have gotten from the normal tcg-op.c expansion.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
        } else {
            tcg_out_ext16u(s, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
        }
        break;

    case INDEX_op_sextract_i32:
        /* We don't implement sextract_i64, as we cannot sign-extend to
           64-bits without using the REX prefix that explicitly excludes
           access to the high-byte registers.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
        } else {
            tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
        }
        break;

    OP_32_64(extract2):
        /* Note that SHRD outputs to the r/m operand.  */
        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
        tcg_out8(s, args[3]);
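        /*
         * I.e. extract2 computes a0 = (a2:a0) >> pos: the low half comes
         * from a0 (aliased with the first input by constraint) and the
         * incoming high bits from a2 -- exactly the double-shift that
         * SHRD implements.
         */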
        break;

    case INDEX_op_mb:
        tcg_out_mb(s, a2);
        break;
    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_mov_i64:
    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
    case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
    case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
    case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
    case INDEX_op_ext8s_i64:
    case INDEX_op_ext8u_i32:
    case INDEX_op_ext8u_i64:
    case INDEX_op_ext16s_i32:
    case INDEX_op_ext16s_i64:
    case INDEX_op_ext16u_i32:
    case INDEX_op_ext16u_i64:
    case INDEX_op_ext32s_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    default:
        g_assert_not_reached();
    }

#undef OP_32_64
}
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                           unsigned vecl, unsigned vece,
                           const TCGArg args[TCG_MAX_OP_ARGS],
                           const int const_args[TCG_MAX_OP_ARGS])
{
    static int const add_insn[4] = {
        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
    };
    static int const ssadd_insn[4] = {
        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
    };
    static int const usadd_insn[4] = {
        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
    };
    static int const sub_insn[4] = {
        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
    };
    static int const sssub_insn[4] = {
        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
    };
    static int const ussub_insn[4] = {
        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
    };
    static int const mul_insn[4] = {
        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
    };
    static int const shift_imm_insn[4] = {
        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
    };
    static int const cmpeq_insn[4] = {
        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
    };
    static int const cmpgt_insn[4] = {
        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
    };
    static int const punpckl_insn[4] = {
        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
    };
    static int const punpckh_insn[4] = {
        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
    };
    static int const packss_insn[4] = {
        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
    };
    static int const packus_insn[4] = {
        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
    };
    static int const smin_insn[4] = {
        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
    };
    static int const smax_insn[4] = {
        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
    };
    static int const umin_insn[4] = {
        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
    };
    static int const umax_insn[4] = {
        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
    };
    static int const rotlv_insn[4] = {
        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
    };
    static int const rotrv_insn[4] = {
        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
    };
    static int const shlv_insn[4] = {
        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
    };
    static int const shrv_insn[4] = {
        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
    };
    static int const sarv_insn[4] = {
        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
    };
    static int const shls_insn[4] = {
        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
    };
    static int const shrs_insn[4] = {
        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
    };
    static int const sars_insn[4] = {
        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
    };
    static int const vpshldi_insn[4] = {
        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
    };
    static int const vpshldv_insn[4] = {
        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
    };
    static int const vpshrdv_insn[4] = {
        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
    };
    static int const abs_insn[4] = {
        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
    };

    TCGType type = vecl + TCG_TYPE_V64;
    int insn, sub;
    TCGArg a0, a1, a2, a3;

    a0 = args[0];
    a1 = args[1];
    a2 = args[2];

    switch (opc) {
    case INDEX_op_add_vec:
        insn = add_insn[vece];
        goto gen_simd;
    case INDEX_op_ssadd_vec:
        insn = ssadd_insn[vece];
        goto gen_simd;
    case INDEX_op_usadd_vec:
        insn = usadd_insn[vece];
        goto gen_simd;
    case INDEX_op_sub_vec:
        insn = sub_insn[vece];
        goto gen_simd;
    case INDEX_op_sssub_vec:
        insn = sssub_insn[vece];
        goto gen_simd;
    case INDEX_op_ussub_vec:
        insn = ussub_insn[vece];
        goto gen_simd;
    case INDEX_op_mul_vec:
        insn = mul_insn[vece];
        goto gen_simd;
    case INDEX_op_and_vec:
        insn = OPC_PAND;
        goto gen_simd;
    case INDEX_op_or_vec:
        insn = OPC_POR;
        goto gen_simd;
    case INDEX_op_xor_vec:
        insn = OPC_PXOR;
        goto gen_simd;
    case INDEX_op_smin_vec:
        insn = smin_insn[vece];
        goto gen_simd;
    case INDEX_op_umin_vec:
        insn = umin_insn[vece];
        goto gen_simd;
    case INDEX_op_smax_vec:
        insn = smax_insn[vece];
        goto gen_simd;
    case INDEX_op_umax_vec:
        insn = umax_insn[vece];
        goto gen_simd;
    case INDEX_op_shlv_vec:
        insn = shlv_insn[vece];
        goto gen_simd;
    case INDEX_op_shrv_vec:
        insn = shrv_insn[vece];
        goto gen_simd;
    case INDEX_op_sarv_vec:
        insn = sarv_insn[vece];
        goto gen_simd;
    case INDEX_op_rotlv_vec:
        insn = rotlv_insn[vece];
        goto gen_simd;
    case INDEX_op_rotrv_vec:
        insn = rotrv_insn[vece];
        goto gen_simd;
    case INDEX_op_shls_vec:
        insn = shls_insn[vece];
        goto gen_simd;
    case INDEX_op_shrs_vec:
        insn = shrs_insn[vece];
        goto gen_simd;
    case INDEX_op_sars_vec:
        insn = sars_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckl_vec:
        insn = punpckl_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_punpckh_vec:
        insn = punpckh_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packss_vec:
        insn = packss_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_packus_vec:
        insn = packus_insn[vece];
        goto gen_simd;
    case INDEX_op_x86_vpshldv_vec:
        insn = vpshldv_insn[vece];
        a1 = a2;
        a2 = args[3];
        goto gen_simd;
    case INDEX_op_x86_vpshrdv_vec:
        insn = vpshrdv_insn[vece];
        a1 = a2;
        a2 = args[3];
        goto gen_simd;
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
        /* First merge the two 32-bit inputs to a single 64-bit element. */
        tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
        /* Then replicate the 64-bit elements across the rest of the vector. */
        if (type != TCG_TYPE_V64) {
            tcg_out_dup_vec(s, type, MO_64, a0, a0);
        }
        break;
#endif
    case INDEX_op_abs_vec:
        insn = abs_insn[vece];
        a2 = a1;
        a1 = 0;
        goto gen_simd;

    gen_simd:
        tcg_debug_assert(insn != OPC_UD2);
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        break;

    case INDEX_op_cmp_vec:
        sub = args[3];
        if (sub == TCG_COND_EQ) {
            insn = cmpeq_insn[vece];
        } else if (sub == TCG_COND_GT) {
            insn = cmpgt_insn[vece];
        } else {
            g_assert_not_reached();
        }
        goto gen_simd;

    case INDEX_op_andc_vec:
        insn = OPC_PANDN;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a2, a1);
        break;

    case INDEX_op_shli_vec:
        insn = shift_imm_insn[vece];
        sub = 6;
        goto gen_shift;
    case INDEX_op_shri_vec:
        insn = shift_imm_insn[vece];
        sub = 2;
        goto gen_shift;
    case INDEX_op_sari_vec:
        if (vece == MO_64) {
            insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
        } else {
            insn = shift_imm_insn[vece];
        }
        sub = 4;
        goto gen_shift;
    case INDEX_op_rotli_vec:
        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
        if (vece == MO_64) {
            insn |= P_VEXW;
        }
        sub = 1;
        goto gen_shift;
    gen_shift:
        tcg_debug_assert(vece != MO_8);
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, sub, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_ld_vec:
        tcg_out_ld(s, type, a0, a1, a2);
        break;
    case INDEX_op_st_vec:
        tcg_out_st(s, type, a0, a1, a2);
        break;
    case INDEX_op_dupm_vec:
        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
        break;

    case INDEX_op_x86_shufps_vec:
        insn = OPC_SHUFPS;
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_blend_vec:
        if (vece == MO_16) {
            insn = OPC_PBLENDW;
        } else if (vece == MO_32) {
            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
        } else {
            g_assert_not_reached();
        }
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_vperm2i128_vec:
        insn = OPC_VPERM2I128;
        sub = args[3];
        goto gen_simd_imm8;
    case INDEX_op_x86_vpshldi_vec:
        insn = vpshldi_insn[vece];
        sub = args[3];
        goto gen_simd_imm8;

    case INDEX_op_not_vec:
        insn = OPC_VPTERNLOGQ;
        a2 = a1;
        sub = 0x33; /* !B */
        goto gen_simd_imm8;
    case INDEX_op_nor_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0x11; /* norCB */
        goto gen_simd_imm8;
    case INDEX_op_nand_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0x77; /* nandCB */
        goto gen_simd_imm8;
    case INDEX_op_eqv_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0x99; /* xnorCB */
        goto gen_simd_imm8;
    case INDEX_op_orc_vec:
        insn = OPC_VPTERNLOGQ;
        sub = 0xdd; /* orB!C */
        goto gen_simd_imm8;

    case INDEX_op_bitsel_vec:
        insn = OPC_VPTERNLOGQ;
        a3 = args[3];
        if (a0 == a1) {
            a1 = a2;
            a2 = a3;
            sub = 0xca; /* A?B:C */
        } else if (a0 == a2) {
            a2 = a3;
            sub = 0xe2; /* B?A:C */
        } else {
            tcg_out_mov(s, type, a0, a3);
            sub = 0xb8; /* B?C:A */
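            /*
             * The VPTERNLOGQ immediate is a 3-input truth table with
             * inputs A = dst, B = src1, C = src2, one result bit per
             * (a,b,c) row.  E.g. A?B:C evaluates over the eight rows
             * abc = 000..111 to bits 0,1,0,1,0,0,1,1 from row 0 up,
             * i.e. 0xca; likewise 0x33 is !B and 0x99 is xnor(B,C).
             */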
        }
        goto gen_simd_imm8;

    gen_simd_imm8:
        tcg_debug_assert(insn != OPC_UD2);
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        tcg_out8(s, sub);
        break;

    case INDEX_op_x86_vpblendvb_vec:
        insn = OPC_VPBLENDVB;
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, a0, a1, a2);
        tcg_out8(s, args[3] << 4);
        break;

    case INDEX_op_x86_psrldq_vec:
        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
        tcg_out8(s, a2);
        break;

    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
    default:
        g_assert_not_reached();
    }
}
static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
{
    switch (op) {
    case INDEX_op_goto_ptr:
        return C_O0_I1(r);

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return C_O1_I1(r, r);

    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return C_O0_I2(qi, r);

    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return C_O0_I2(ri, r);

    case INDEX_op_st_i64:
        return C_O0_I2(re, r);

    case INDEX_op_add_i32:
    case INDEX_op_add_i64:
        return C_O1_I2(r, r, re);

    case INDEX_op_sub_i32:
    case INDEX_op_sub_i64:
    case INDEX_op_mul_i32:
    case INDEX_op_mul_i64:
    case INDEX_op_or_i32:
    case INDEX_op_or_i64:
    case INDEX_op_xor_i32:
    case INDEX_op_xor_i64:
        return C_O1_I2(r, 0, re);

    case INDEX_op_and_i32:
    case INDEX_op_and_i64:
        return C_O1_I2(r, 0, reZ);

    case INDEX_op_andc_i32:
    case INDEX_op_andc_i64:
        return C_O1_I2(r, r, rI);

    case INDEX_op_shl_i32:
    case INDEX_op_shl_i64:
    case INDEX_op_shr_i32:
    case INDEX_op_shr_i64:
    case INDEX_op_sar_i32:
    case INDEX_op_sar_i64:
        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);

    case INDEX_op_rotl_i32:
    case INDEX_op_rotl_i64:
    case INDEX_op_rotr_i32:
    case INDEX_op_rotr_i64:
        return C_O1_I2(r, 0, ci);

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        return C_O0_I2(r, reT);

    case INDEX_op_bswap16_i32:
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap32_i32:
    case INDEX_op_bswap32_i64:
    case INDEX_op_bswap64_i64:
    case INDEX_op_neg_i32:
    case INDEX_op_neg_i64:
    case INDEX_op_not_i32:
    case INDEX_op_not_i64:
    case INDEX_op_extrh_i64_i32:
        return C_O1_I1(r, 0);

    case INDEX_op_ext8s_i32:
    case INDEX_op_ext8s_i64:
    case INDEX_op_ext8u_i32:
    case INDEX_op_ext8u_i64:
        return C_O1_I1(r, q);

    case INDEX_op_ext16s_i32:
    case INDEX_op_ext16s_i64:
    case INDEX_op_ext16u_i32:
    case INDEX_op_ext16u_i64:
    case INDEX_op_ext32s_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extrl_i64_i32:
    case INDEX_op_extract_i32:
    case INDEX_op_extract_i64:
    case INDEX_op_sextract_i32:
    case INDEX_op_ctpop_i32:
    case INDEX_op_ctpop_i64:
        return C_O1_I1(r, r);

    case INDEX_op_extract2_i32:
    case INDEX_op_extract2_i64:
        return C_O1_I2(r, 0, r);

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        return C_O1_I2(q, 0, qi);

    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
    case INDEX_op_negsetcond_i32:
    case INDEX_op_negsetcond_i64:
        return C_O1_I2(q, r, reT);

    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        return C_O1_I4(r, r, reT, r, 0);

    case INDEX_op_div2_i32:
    case INDEX_op_div2_i64:
    case INDEX_op_divu2_i32:
    case INDEX_op_divu2_i64:
        return C_O2_I3(a, d, 0, 1, r);

    case INDEX_op_mulu2_i32:
    case INDEX_op_mulu2_i64:
    case INDEX_op_muls2_i32:
    case INDEX_op_muls2_i64:
        return C_O2_I2(a, d, a, r);

    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        return C_N1_O1_I4(r, r, 0, 1, re, re);

    case INDEX_op_ctz_i32:
    case INDEX_op_ctz_i64:
        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);

    case INDEX_op_clz_i32:
    case INDEX_op_clz_i64:
        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);

    case INDEX_op_qemu_ld_a32_i32:
        return C_O1_I1(r, L);
    case INDEX_op_qemu_ld_a64_i32:
        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);

    case INDEX_op_qemu_st_a32_i32:
        return C_O0_I2(L, L);
    case INDEX_op_qemu_st_a64_i32:
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
    case INDEX_op_qemu_st8_a32_i32:
        return C_O0_I2(s, L);
    case INDEX_op_qemu_st8_a64_i32:
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);

    case INDEX_op_qemu_ld_a32_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
    case INDEX_op_qemu_ld_a64_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);

    case INDEX_op_qemu_st_a32_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
    case INDEX_op_qemu_st_a64_i64:
        return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);

    case INDEX_op_qemu_ld_a32_i128:
    case INDEX_op_qemu_ld_a64_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        return C_O2_I1(r, r, L);
    case INDEX_op_qemu_st_a32_i128:
    case INDEX_op_qemu_st_a64_i128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
        return C_O0_I3(L, L, L);

    case INDEX_op_brcond2_i32:
        return C_O0_I4(r, r, ri, ri);

    case INDEX_op_setcond2_i32:
        return C_O1_I4(r, r, r, ri, ri);

    case INDEX_op_ld_vec:
    case INDEX_op_dupm_vec:
        return C_O1_I1(x, r);

    case INDEX_op_st_vec:
        return C_O0_I2(x, r);

    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_mul_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_orc_vec:
    case INDEX_op_nand_vec:
    case INDEX_op_nor_vec:
    case INDEX_op_eqv_vec:
    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
    case INDEX_op_smin_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
    case INDEX_op_sarv_vec:
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
    case INDEX_op_sars_vec:
    case INDEX_op_cmp_vec:
    case INDEX_op_x86_shufps_vec:
    case INDEX_op_x86_blend_vec:
    case INDEX_op_x86_packss_vec:
    case INDEX_op_x86_packus_vec:
    case INDEX_op_x86_vperm2i128_vec:
    case INDEX_op_x86_punpckl_vec:
    case INDEX_op_x86_punpckh_vec:
    case INDEX_op_x86_vpshldi_vec:
#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_dup2_vec:
#endif
        return C_O1_I2(x, x, x);

    case INDEX_op_abs_vec:
    case INDEX_op_dup_vec:
    case INDEX_op_not_vec:
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
    case INDEX_op_sari_vec:
    case INDEX_op_rotli_vec:
    case INDEX_op_x86_psrldq_vec:
        return C_O1_I1(x, x);

    case INDEX_op_x86_vpshldv_vec:
    case INDEX_op_x86_vpshrdv_vec:
        return C_O1_I3(x, 0, x, x);

    case INDEX_op_bitsel_vec:
    case INDEX_op_x86_vpblendvb_vec:
        return C_O1_I3(x, x, x, x);

    default:
        g_assert_not_reached();
    }
}
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    switch (opc) {
    case INDEX_op_add_vec:
    case INDEX_op_sub_vec:
    case INDEX_op_and_vec:
    case INDEX_op_or_vec:
    case INDEX_op_xor_vec:
    case INDEX_op_andc_vec:
    case INDEX_op_orc_vec:
    case INDEX_op_nand_vec:
    case INDEX_op_nor_vec:
    case INDEX_op_eqv_vec:
    case INDEX_op_not_vec:
    case INDEX_op_bitsel_vec:
        return 1;
    case INDEX_op_cmp_vec:
    case INDEX_op_cmpsel_vec:
        return -1;

    case INDEX_op_rotli_vec:
        return have_avx512vl && vece >= MO_32 ? 1 : -1;

    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        /* We must expand the operation for MO_8.  */
        return vece == MO_8 ? -1 : 1;

    case INDEX_op_sari_vec:
        switch (vece) {
        case MO_8:
            return -1;
        case MO_16:
        case MO_32:
            return 1;
        case MO_64:
            if (have_avx512vl) {
                return 1;
            }
            /*
             * We can emulate this for MO_64, but it does not pay off
             * unless we're producing at least 4 values.
             */
            return type >= TCG_TYPE_V256 ? -1 : 0;
        }
        return 0;

    case INDEX_op_shls_vec:
    case INDEX_op_shrs_vec:
        return vece >= MO_16;
    case INDEX_op_sars_vec:
        switch (vece) {
        case MO_16:
        case MO_32:
            return 1;
        case MO_64:
            return have_avx512vl;
        }
        return 0;
    case INDEX_op_rotls_vec:
        return vece >= MO_16 ? -1 : 0;

    case INDEX_op_shlv_vec:
    case INDEX_op_shrv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512bw;
        case MO_32:
        case MO_64:
            return have_avx2;
        }
        return 0;
    case INDEX_op_sarv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512bw;
        case MO_32:
            return have_avx2;
        case MO_64:
            return have_avx512vl;
        }
        return 0;
    case INDEX_op_rotlv_vec:
    case INDEX_op_rotrv_vec:
        switch (vece) {
        case MO_16:
            return have_avx512vbmi2 ? -1 : 0;
        case MO_32:
        case MO_64:
            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
        }
        return 0;

    case INDEX_op_mul_vec:
        switch (vece) {
        case MO_8:
            return -1;
        case MO_64:
            return have_avx512dq;
        }
        return 1;

    case INDEX_op_ssadd_vec:
    case INDEX_op_usadd_vec:
    case INDEX_op_sssub_vec:
    case INDEX_op_ussub_vec:
        return vece <= MO_16;
    case INDEX_op_smin_vec:
    case INDEX_op_smax_vec:
    case INDEX_op_umin_vec:
    case INDEX_op_umax_vec:
    case INDEX_op_abs_vec:
        return vece <= MO_32 || have_avx512vl;

    default:
        return 0;
    }
}
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    uint8_t mask;

    tcg_debug_assert(vece == MO_8);
    if (right) {
        mask = 0xff >> imm;
        tcg_gen_shri_vec(MO_16, v0, v1, imm);
    } else {
        mask = 0xff << imm;
        tcg_gen_shli_vec(MO_16, v0, v1, imm);
    }
    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
}
static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to 16-bit, shift, and repack.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case MO_64:
        t1 = tcg_temp_new_vec(type);
        if (imm <= 32) {
            /*
             * We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift.  Note that the ISA says shift of 32 is valid, but TCG
             * does not, so we have to bound the smaller shift -- we get the
             * same result in the high half either way.
             */
            tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
        } else {
            /* Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.  */
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
                            tcg_constant_vec(type, MO_64, 0), v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
        }
        tcg_temp_free_vec(t1);
        break;

    default:
        g_assert_not_reached();
    }
}
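
/*
 * The immediate-rotate fallback below uses the usual identity
 *     rotl(x, i) = (x << i) | (x >> (width - i))
 * with width = 8 << vece; AVX512 VBMI2's VPSHLD performs the equivalent
 * double-shift in a single instruction when available.
 */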
static void expand_vec_rotli(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t;

    if (vece != MO_8 && have_avx512vbmi2) {
        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_shli_vec(vece, t, v1, imm);
    tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}
static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec sh, bool right)
{
    TCGv_vec t;

    if (have_avx512vbmi2) {
        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
        return;
    }

    t = tcg_temp_new_vec(type);
    tcg_gen_dupi_vec(vece, t, 8 << vece);
    tcg_gen_sub_vec(vece, t, t, sh);
    if (right) {
        tcg_gen_shlv_vec(vece, t, v1, t);
        tcg_gen_shrv_vec(vece, v0, v1, sh);
    } else {
        tcg_gen_shrv_vec(vece, t, v1, t);
        tcg_gen_shlv_vec(vece, v0, v1, sh);
    }
    tcg_gen_or_vec(vece, v0, v0, t);
    tcg_temp_free_vec(t);
}
static void expand_vec_rotls(TCGType type, unsigned vece,
                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    tcg_debug_assert(vece != MO_8);

    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
        tcg_gen_dup_i32_vec(vece, t, lsh);
        if (vece >= MO_32) {
            tcg_gen_rotlv_vec(vece, v0, v1, t);
        } else {
            expand_vec_rotv(type, vece, v0, v1, t, false);
        }
    } else {
        TCGv_i32 rsh = tcg_temp_new_i32();

        tcg_gen_neg_i32(rsh, lsh);
        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
        tcg_gen_shls_vec(vece, t, v1, lsh);
        tcg_gen_shrs_vec(vece, v0, v1, rsh);
        tcg_gen_or_vec(vece, v0, v0, t);

        tcg_temp_free_i32(rsh);
    }

    tcg_temp_free_vec(t);
}
static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4, zero;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits before
     * using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
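
    /*
     * Worked example for one byte lane pair, x = 0x80, y = 3: unpacking
     * gives the 16-bit values 0x0080 and 0x0300; their product truncates
     * to 0x8000, and 0x8000 >> 8 = 0x0080 carries x*y mod 256 in the low
     * byte with a zero high byte, so the saturating pack cannot clamp it.
     */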
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}
static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    enum {
        NEED_INV  = 1,
        NEED_SWAP = 2,
        NEED_BIAS = 4,
        NEED_UMIN = 8,
        NEED_UMAX = 16,
    };
    TCGv_vec t1, t2, t3;
    uint8_t fixup;

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_GT:
        fixup = 0;
        break;
    case TCG_COND_NE:
    case TCG_COND_LE:
        fixup = NEED_INV;
        break;
    case TCG_COND_LT:
        fixup = NEED_SWAP;
        break;
    case TCG_COND_GE:
        fixup = NEED_SWAP | NEED_INV;
        break;
    case TCG_COND_LEU:
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
            fixup = NEED_UMIN;
        } else {
            fixup = NEED_BIAS | NEED_INV;
        }
        break;
    case TCG_COND_GTU:
        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
            fixup = NEED_UMIN | NEED_INV;
        } else {
            fixup = NEED_BIAS;
        }
        break;
    case TCG_COND_GEU:
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
            fixup = NEED_UMAX;
        } else {
            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
        }
        break;
    case TCG_COND_LTU:
        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
            fixup = NEED_UMAX | NEED_INV;
        } else {
            fixup = NEED_BIAS | NEED_SWAP;
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (fixup & NEED_INV) {
        cond = tcg_invert_cond(cond);
    }
    if (fixup & NEED_SWAP) {
        t1 = v1, v1 = v2, v2 = t1;
        cond = tcg_swap_cond(cond);
    }

    if (fixup & (NEED_UMIN | NEED_UMAX)) {
        t1 = tcg_temp_new_vec(type);
        if (fixup & NEED_UMIN) {
            tcg_gen_umin_vec(vece, t1, v1, v2);
        } else {
            tcg_gen_umax_vec(vece, t1, v1, v2);
        }
        cond = TCG_COND_EQ;
        v2 = t1;
    } else if (fixup & NEED_BIAS) {
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
        tcg_gen_sub_vec(vece, t1, v1, t3);
        tcg_gen_sub_vec(vece, t2, v2, t3);
        v1 = t1;
        v2 = t2;
        cond = tcg_signed_cond(cond);
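        /*
         * Biasing both operands by the sign bit (x - 0x80... flips only
         * the top bit) turns an unsigned comparison into the signed one
         * that PCMPGT implements: x <u y iff (x - bias) <s (y - bias).
         */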
    }

    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
    /* Expand directly; do not recurse.  */
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

    if (fixup & (NEED_BIAS | NEED_UMIN | NEED_UMAX)) {
        tcg_temp_free_vec(t1);
        if (fixup & NEED_BIAS) {
            tcg_temp_free_vec(t2);
        }
    }

    return fixup & NEED_INV;
}
static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
        tcg_gen_not_vec(vece, v0, v0);
    }
}
static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
                              TCGv_vec c1, TCGv_vec c2,
                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
{
    TCGv_vec t = tcg_temp_new_vec(type);

    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
        /* Invert the sense of the compare by swapping arguments.  */
        TCGv_vec x;
        x = v3, v3 = v4, v4 = x;
    }
    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
              tcgv_vec_arg(v3), tcgv_vec_arg(t));
    tcg_temp_free_vec(t);
}
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a2;
    TCGv_vec v0, v1, v2, v3, v4;

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
    a2 = va_arg(va, TCGArg);

    switch (opc) {
    case INDEX_op_shli_vec:
        expand_vec_shi(type, vece, false, v0, v1, a2);
        break;
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, true, v0, v1, a2);
        break;
    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotli_vec:
        expand_vec_rotli(type, vece, v0, v1, a2);
        break;

    case INDEX_op_rotls_vec:
        expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
        break;

    case INDEX_op_rotlv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, false);
        break;
    case INDEX_op_rotrv_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_rotv(type, vece, v0, v1, v2, true);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
        break;

    case INDEX_op_cmpsel_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
        break;

    default:
        g_assert_not_reached();
    }

    va_end(va);
}
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};
/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit.  */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
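
/*
 * For example, on a 64-bit non-Windows host, six registers are saved
 * (rbp, rbx, r12-r15), so PUSH_SIZE is (1 + 6) * 8 = 56 bytes including
 * the return address, and FRAME_SIZE rounds 56 plus the static call-arg
 * and temp-buffer areas up to TCG_TARGET_STACK_ALIGN.  The concrete
 * values are configuration-dependent.
 */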
/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

    if (!tcg_use_softmmu && guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            assert(TCG_TARGET_REG_BITS == 64);
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }

    if (TCG_TARGET_REG_BITS == 32) {
        tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                   (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb.  */
        tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                             (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                             + stack_addend);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
        tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
        /* jmp *tb.  */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
    }

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}
static void tcg_out_tb_start(TCGContext *s)
{
    /* nothing to do */
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
    memset(p, 0x90, count);
}
static void tcg_target_init(TCGContext *s)
{
    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
    tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
    /* These are call saved, and we don't save them, so don't use them. */
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
#endif
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2 byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif
#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif