//
// Copyright (c) 2011, 2024, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
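//
// As an illustrative reading aid only (FOO/foo are made-up names, not
// registers defined in this file), a definition written as
//   reg_def FOO( SOC, SOE, Op_RegI, 5, foo->as_VMReg());
// would mean: save-on-call for the register allocator, save-on-entry for
// the C calling convention, spilled as an integer (Op_RegI), encoding 5 in
// the opcodes, backed by the VM register handle foo->as_VMReg().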

// XMM registers.  512-bit registers or 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              XMM0-XMM3 might hold parameters
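//
// The lettered slots map onto the vector widths used by the register
// classes below: slot (a) alone holds a 32-bit Float, (a)-(b) hold a
// 64-bit Double, (a)-(d) span the 128-bit XMM lane, (a)-(h) the 256-bit
// YMM lane, and (a)-(p) the full 512-bit ZMM register.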

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
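// (k0 is not defined here: in EVEX encodings mask register 0 selects
// "no masking", so it is not made available to the register allocator.)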
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
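// A reg_class_dynamic selects between its two member classes based on the
// predicate evaluated at JIT time: when EVEX is supported the *_evex class
// (which also exposes XMM16-XMM31) is used, otherwise the legacy class.
// The same pattern is repeated for the double and vector classes below.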

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for all 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 128bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for all 128bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 256bit vector registers
reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044
#endif
1045
                      );
1046

1047
// Class for all 256bit vector registers
1048
reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056
#ifdef _LP64
1057
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065
                      XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066
                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067
                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068
                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069
                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070
                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071
                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072
                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073
                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074
                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075
                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076
                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077
                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078
                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079
                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080
                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081
#endif
1082
                      );
1083

1084
reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085
reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086

1087
// Class for all 512bit vector registers
1088
reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096
#ifdef _LP64
1097
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105
                     ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106
                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107
                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108
                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109
                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110
                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111
                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112
                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113
                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114
                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115
                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116
                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117
                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118
                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119
                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120
                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121
#endif
1122
                      );
1123

1124
// Class for restricted 512bit vector registers
1125
reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133
#ifdef _LP64
1134
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142
#endif
1143
                      );
1144

1145
reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146
reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
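// Note: a reg_class_dynamic resolves to one of its two underlying classes at VM startup
// based on the trailing predicate; e.g. vectorz_reg behaves as vectorz_reg_evex
// (XMM0-XMM31) when VM_Version::supports_evex() is true, and as vectorz_reg_legacy
// (XMM0-XMM15) otherwise.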
1147

1148
reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149
%}
1150

1151

1152
//----------SOURCE BLOCK-------------------------------------------------------
1153
// This is a block of C++ code which provides values, functions, and
1154
// definitions necessary in the rest of the architecture description
1155

1156
source_hpp %{
1157
// Header information of the source block.
1158
// Method declarations/definitions which are used outside
1159
// the ad-scope can conveniently be defined here.
1160
//
1161
// To keep related declarations/definitions/uses close together,
1162
// we switch between source %{ }% and source_hpp %{ }% freely as needed.
1163

1164
#include "runtime/vm_version.hpp"
1165

1166
class NativeJump;
1167

1168
class CallStubImpl {
1169

1170
  //--------------------------------------------------------------
1171
  //---<  Used for optimization in Compile::shorten_branches  >---
1172
  //--------------------------------------------------------------
1173

1174
 public:
1175
  // Size of call trampoline stub.
1176
  static uint size_call_trampoline() {
1177
    return 0; // no call trampolines on this platform
1178
  }
1179

1180
  // number of relocations needed by a call trampoline stub
1181
  static uint reloc_call_trampoline() {
1182
    return 0; // no call trampolines on this platform
1183
  }
1184
};
1185

1186
class HandlerImpl {
1187

1188
 public:
1189

1190
  static int emit_exception_handler(C2_MacroAssembler *masm);
1191
  static int emit_deopt_handler(C2_MacroAssembler* masm);
1192

1193
  static uint size_exception_handler() {
1194
    // NativeCall instruction size is the same as NativeJump.
1195
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1197
    // Note that this value is also credited (in output.cpp) to
1198
    // the size of the code section.
1199
    return NativeJump::instruction_size;
1200
  }
1201

1202
#ifdef _LP64
1203
  static uint size_deopt_handler() {
1204
    // three 5-byte instructions plus one move for the unreachable address.
1205
    return 15+3;
1206
  }
1207
#else
1208
  static uint size_deopt_handler() {
1209
    // NativeCall instruction size is the same as NativeJump.
1210
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1212
    // Note that this value is also credited (in output.cpp) to
1213
    // the size of the code section.
1214
    return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215
  }
1216
#endif
1217
};
1218

1219
inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1220
  switch(bytes) {
1221
    case  4: // fall-through
1222
    case  8: // fall-through
1223
    case 16: return Assembler::AVX_128bit;
1224
    case 32: return Assembler::AVX_256bit;
1225
    case 64: return Assembler::AVX_512bit;
1226

1227
    default: {
1228
      ShouldNotReachHere();
1229
      return Assembler::AVX_NoVec;
1230
    }
1231
  }
1232
}
1233

1234
static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1235
  return vector_length_encoding(Matcher::vector_length_in_bytes(n));
1236
}
1237

1238
static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1239
  uint def_idx = use->operand_index(opnd);
1240
  Node* def = use->in(def_idx);
1241
  return vector_length_encoding(def);
1242
}
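
// Illustrative sketch (hypothetical operand names): an instruct encoding block typically
// selects the VEX/EVEX length bit from the node it is emitting, e.g.
//   int vlen_enc = vector_length_encoding(this);
//   __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);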
1243

1244
static inline bool is_vector_popcount_predicate(BasicType bt) {
1245
  return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
1246
         (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
1247
}
1248

1249
static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
1250
  return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
1251
           (VM_Version::supports_avx512vl() || vlen_bytes == 64);
1252
}
1253

1254
class Node::PD {
1255
public:
1256
  enum NodeFlags {
1257
    Flag_intel_jcc_erratum    = Node::_last_flag << 1,
1258
    Flag_sets_carry_flag      = Node::_last_flag << 2,
1259
    Flag_sets_parity_flag     = Node::_last_flag << 3,
1260
    Flag_sets_zero_flag       = Node::_last_flag << 4,
1261
    Flag_sets_overflow_flag   = Node::_last_flag << 5,
1262
    Flag_sets_sign_flag       = Node::_last_flag << 6,
1263
    Flag_clears_carry_flag    = Node::_last_flag << 7,
1264
    Flag_clears_parity_flag   = Node::_last_flag << 8,
1265
    Flag_clears_zero_flag     = Node::_last_flag << 9,
1266
    Flag_clears_overflow_flag = Node::_last_flag << 10,
1267
    Flag_clears_sign_flag     = Node::_last_flag << 11,
1268
    _last_flag                = Flag_clears_sign_flag
1269
  };
1270
};
1271

1272
%} // end source_hpp
1273

1274
source %{
1275

1276
#include "opto/addnode.hpp"
1277
#include "c2_intelJccErratum_x86.hpp"
1278

1279
void PhaseOutput::pd_perform_mach_node_analysis() {
1280
  if (VM_Version::has_intel_jcc_erratum()) {
1281
    int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1282
    _buf_sizes._code += extra_padding;
1283
  }
1284
}
1285

1286
int MachNode::pd_alignment_required() const {
1287
  if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1288
    // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1289
    return IntelJccErratum::largest_jcc_size() + 1;
1290
  } else {
1291
    return 1;
1292
  }
1293
}
1294

1295
int MachNode::compute_padding(int current_offset) const {
1296
  if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1297
    Compile* C = Compile::current();
1298
    PhaseOutput* output = C->output();
1299
    Block* block = output->block();
1300
    int index = output->index();
1301
    return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1302
  } else {
1303
    return 0;
1304
  }
1305
}
1306

1307
// Emit exception handler code.
1308
// Stuff framesize into a register and call a VM stub routine.
1309
int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
1310

1311
  // Note that the code buffer's insts_mark is always relative to insts.
1312
  // That's why we must use the macroassembler to generate a handler.
1313
  address base = __ start_a_stub(size_exception_handler());
1314
  if (base == nullptr) {
1315
    ciEnv::current()->record_failure("CodeCache is full");
1316
    return 0;  // CodeBuffer::expand failed
1317
  }
1318
  int offset = __ offset();
1319
  __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320
  assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321
  __ end_a_stub();
1322
  return offset;
1323
}
1324

1325
// Emit deopt handler code.
1326
int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
1327

1328
  // Note that the code buffer's insts_mark is always relative to insts.
1329
  // That's why we must use the macroassembler to generate a handler.
1330
  address base = __ start_a_stub(size_deopt_handler());
1331
  if (base == nullptr) {
1332
    ciEnv::current()->record_failure("CodeCache is full");
1333
    return 0;  // CodeBuffer::expand failed
1334
  }
1335
  int offset = __ offset();
1336

1337
#ifdef _LP64
1338
  address the_pc = (address) __ pc();
1339
  Label next;
1340
  // push a "the_pc" on the stack without destroying any registers
1341
  // as they all may be live.
1342

1343
  // push address of "next"
1344
  __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1345
  __ bind(next);
1346
  // adjust it so it matches "the_pc"
1347
  __ subptr(Address(rsp, 0), __ offset() - offset);
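  // At this point [rsp] holds the address of "next"; subtracting the number of bytes
  // emitted since "the_pc" rewinds the saved value to "the_pc" itself.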
1348
#else
1349
  InternalAddress here(__ pc());
1350
  __ pushptr(here.addr(), noreg);
1351
#endif
1352

1353
  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1354
  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1355
  __ end_a_stub();
1356
  return offset;
1357
}
1358

1359
static Assembler::Width widthForType(BasicType bt) {
1360
  if (bt == T_BYTE) {
1361
    return Assembler::B;
1362
  } else if (bt == T_SHORT) {
1363
    return Assembler::W;
1364
  } else if (bt == T_INT) {
1365
    return Assembler::D;
1366
  } else {
1367
    assert(bt == T_LONG, "not a long: %s", type2name(bt));
1368
    return Assembler::Q;
1369
  }
1370
}
1371

1372
//=============================================================================
1373

1374
  // Float masks come from different places depending on platform.
1375
#ifdef _LP64
1376
  static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1377
  static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1378
  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1379
  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1380
#else
1381
  static address float_signmask()  { return (address)float_signmask_pool; }
1382
  static address float_signflip()  { return (address)float_signflip_pool; }
1383
  static address double_signmask() { return (address)double_signmask_pool; }
1384
  static address double_signflip() { return (address)double_signflip_pool; }
1385
#endif
1386
  static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1387
  static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1388
  static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1389
  static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1390
  static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1391
  static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
1392
  static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1393
  static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1394
  static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1395
  static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1396
  static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1397
  static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1398
  static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1399
  static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
1400
  static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
1401

1402
//=============================================================================
1403
bool Matcher::match_rule_supported(int opcode) {
1404
  if (!has_match_rule(opcode)) {
1405
    return false; // no match rule present
1406
  }
1407
  const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1408
  switch (opcode) {
1409
    case Op_AbsVL:
1410
    case Op_StoreVectorScatter:
1411
      if (UseAVX < 3) {
1412
        return false;
1413
      }
1414
      break;
1415
    case Op_PopCountI:
1416
    case Op_PopCountL:
1417
      if (!UsePopCountInstruction) {
1418
        return false;
1419
      }
1420
      break;
1421
    case Op_PopCountVI:
1422
      if (UseAVX < 2) {
1423
        return false;
1424
      }
1425
      break;
1426
    case Op_CompressV:
1427
    case Op_ExpandV:
1428
    case Op_PopCountVL:
1429
      if (UseAVX < 2) {
1430
        return false;
1431
      }
1432
      break;
1433
    case Op_MulVI:
1434
      if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1435
        return false;
1436
      }
1437
      break;
1438
    case Op_MulVL:
1439
      if (UseSSE < 4) { // only with SSE4_1 or AVX
1440
        return false;
1441
      }
1442
      break;
1443
    case Op_MulReductionVL:
1444
      if (VM_Version::supports_avx512dq() == false) {
1445
        return false;
1446
      }
1447
      break;
1448
    case Op_AddReductionVL:
1449
      if (UseSSE < 2) { // requires at least SSE2
1450
        return false;
1451
      }
1452
      break;
1453
    case Op_AbsVB:
1454
    case Op_AbsVS:
1455
    case Op_AbsVI:
1456
    case Op_AddReductionVI:
1457
    case Op_AndReductionV:
1458
    case Op_OrReductionV:
1459
    case Op_XorReductionV:
1460
      if (UseSSE < 3) { // requires at least SSSE3
1461
        return false;
1462
      }
1463
      break;
1464
    case Op_VectorLoadShuffle:
1465
    case Op_VectorRearrange:
1466
    case Op_MulReductionVI:
1467
      if (UseSSE < 4) { // requires at least SSE4
1468
        return false;
1469
      }
1470
      break;
1471
    case Op_IsInfiniteF:
1472
    case Op_IsInfiniteD:
1473
      if (!VM_Version::supports_avx512dq()) {
1474
        return false;
1475
      }
1476
      break;
1477
    case Op_SqrtVD:
1478
    case Op_SqrtVF:
1479
    case Op_VectorMaskCmp:
1480
    case Op_VectorCastB2X:
1481
    case Op_VectorCastS2X:
1482
    case Op_VectorCastI2X:
1483
    case Op_VectorCastL2X:
1484
    case Op_VectorCastF2X:
1485
    case Op_VectorCastD2X:
1486
    case Op_VectorUCastB2X:
1487
    case Op_VectorUCastS2X:
1488
    case Op_VectorUCastI2X:
1489
    case Op_VectorMaskCast:
1490
      if (UseAVX < 1) { // enabled for AVX only
1491
        return false;
1492
      }
1493
      break;
1494
    case Op_PopulateIndex:
1495
      if (!is_LP64 || (UseAVX < 2)) {
1496
        return false;
1497
      }
1498
      break;
1499
    case Op_RoundVF:
1500
      if (UseAVX < 2) { // enabled for AVX2 only
1501
        return false;
1502
      }
1503
      break;
1504
    case Op_RoundVD:
1505
      if (UseAVX < 3) {
1506
        return false;  // enabled for AVX3 only
1507
      }
1508
      break;
1509
    case Op_CompareAndSwapL:
1510
#ifdef _LP64
1511
    case Op_CompareAndSwapP:
1512
#endif
1513
      break;
1514
    case Op_StrIndexOf:
1515
      if (!UseSSE42Intrinsics) {
1516
        return false;
1517
      }
1518
      break;
1519
    case Op_StrIndexOfChar:
1520
      if (!UseSSE42Intrinsics) {
1521
        return false;
1522
      }
1523
      break;
1524
    case Op_OnSpinWait:
1525
      if (VM_Version::supports_on_spin_wait() == false) {
1526
        return false;
1527
      }
1528
      break;
1529
    case Op_MulVB:
1530
    case Op_LShiftVB:
1531
    case Op_RShiftVB:
1532
    case Op_URShiftVB:
1533
    case Op_VectorInsert:
1534
    case Op_VectorLoadMask:
1535
    case Op_VectorStoreMask:
1536
    case Op_VectorBlend:
1537
      if (UseSSE < 4) {
1538
        return false;
1539
      }
1540
      break;
1541
#ifdef _LP64
1542
    case Op_MaxD:
1543
    case Op_MaxF:
1544
    case Op_MinD:
1545
    case Op_MinF:
1546
      if (UseAVX < 1) { // enabled for AVX only
1547
        return false;
1548
      }
1549
      break;
1550
#endif
1551
    case Op_CacheWB:
1552
    case Op_CacheWBPreSync:
1553
    case Op_CacheWBPostSync:
1554
      if (!VM_Version::supports_data_cache_line_flush()) {
1555
        return false;
1556
      }
1557
      break;
1558
    case Op_ExtractB:
1559
    case Op_ExtractL:
1560
    case Op_ExtractI:
1561
    case Op_RoundDoubleMode:
1562
      if (UseSSE < 4) {
1563
        return false;
1564
      }
1565
      break;
1566
    case Op_RoundDoubleModeV:
1567
      if (VM_Version::supports_avx() == false) {
1568
        return false; // 128bit vroundpd is not available
1569
      }
1570
      break;
1571
    case Op_LoadVectorGather:
1572
    case Op_LoadVectorGatherMasked:
1573
      if (UseAVX < 2) {
1574
        return false;
1575
      }
1576
      break;
1577
    case Op_FmaF:
1578
    case Op_FmaD:
1579
    case Op_FmaVD:
1580
    case Op_FmaVF:
1581
      if (!UseFMA) {
1582
        return false;
1583
      }
1584
      break;
1585
    case Op_MacroLogicV:
1586
      if (UseAVX < 3 || !UseVectorMacroLogic) {
1587
        return false;
1588
      }
1589
      break;
1590

1591
    case Op_VectorCmpMasked:
1592
    case Op_VectorMaskGen:
1593
      if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1594
        return false;
1595
      }
1596
      break;
1597
    case Op_VectorMaskFirstTrue:
1598
    case Op_VectorMaskLastTrue:
1599
    case Op_VectorMaskTrueCount:
1600
    case Op_VectorMaskToLong:
1601
      if (!is_LP64 || UseAVX < 1) {
1602
         return false;
1603
      }
1604
      break;
1605
    case Op_RoundF:
1606
    case Op_RoundD:
1607
      if (!is_LP64) {
1608
        return false;
1609
      }
1610
      break;
1611
    case Op_CopySignD:
1612
    case Op_CopySignF:
1613
      if (UseAVX < 3 || !is_LP64)  {
1614
        return false;
1615
      }
1616
      if (!VM_Version::supports_avx512vl()) {
1617
        return false;
1618
      }
1619
      break;
1620
#ifndef _LP64
1621
    case Op_AddReductionVF:
1622
    case Op_AddReductionVD:
1623
    case Op_MulReductionVF:
1624
    case Op_MulReductionVD:
1625
      if (UseSSE < 1) { // requires at least SSE
1626
        return false;
1627
      }
1628
      break;
1629
    case Op_MulAddVS2VI:
1630
    case Op_RShiftVL:
1631
    case Op_AbsVD:
1632
    case Op_NegVD:
1633
      if (UseSSE < 2) {
1634
        return false;
1635
      }
1636
      break;
1637
#endif // !LP64
1638
    case Op_CompressBits:
1639
      if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
1640
        return false;
1641
      }
1642
      break;
1643
    case Op_ExpandBits:
1644
      if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
1645
        return false;
1646
      }
1647
      break;
1648
    case Op_SignumF:
1649
      if (UseSSE < 1) {
1650
        return false;
1651
      }
1652
      break;
1653
    case Op_SignumD:
1654
      if (UseSSE < 2) {
1655
        return false;
1656
      }
1657
      break;
1658
    case Op_CompressM:
1659
      if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
1660
        return false;
1661
      }
1662
      break;
1663
    case Op_SqrtF:
1664
      if (UseSSE < 1) {
1665
        return false;
1666
      }
1667
      break;
1668
    case Op_SqrtD:
1669
#ifdef _LP64
1670
      if (UseSSE < 2) {
1671
        return false;
1672
      }
1673
#else
1674
      // x86_32.ad has a special match rule for SqrtD.
1675
      // Together with common x86 rules, this handles all UseSSE cases.
1676
#endif
1677
      break;
1678
    case Op_ConvF2HF:
1679
    case Op_ConvHF2F:
1680
      if (!VM_Version::supports_float16()) {
1681
        return false;
1682
      }
1683
      break;
1684
    case Op_VectorCastF2HF:
1685
    case Op_VectorCastHF2F:
1686
      if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
1687
        return false;
1688
      }
1689
      break;
1690
  }
1691
  return true;  // Match rules are supported by default.
1692
}
1693

1694
//------------------------------------------------------------------------
1695

1696
static inline bool is_pop_count_instr_target(BasicType bt) {
1697
  return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
1698
         (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
1699
}
1700

1701
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
1702
  return match_rule_supported_vector(opcode, vlen, bt);
1703
}
1704

1705
// Identify extra cases that we might want to provide match rules for vector nodes and
1706
// other intrinsics guarded with vector length (vlen) and element type (bt).
1707
bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1708
  const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1709
  if (!match_rule_supported(opcode)) {
1710
    return false;
1711
  }
1712
  // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1713
  //   * SSE2 supports 128bit vectors for all types;
1714
  //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1715
  //   * AVX2 supports 256bit vectors for all types;
1716
  //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1717
  //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1718
  // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1719
  // And MaxVectorSize is taken into account as well.
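  // Illustrative example: with AVX2 and MaxVectorSize >= 32, an 8-element INT vector
  // (256 bits) passes this check, while under plain AVX1 it is rejected because 256bit
  // vectors are only available for FLOAT and DOUBLE there.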
1720
  if (!vector_size_supported(bt, vlen)) {
1721
    return false;
1722
  }
1723
  // Special cases which require vector length follow:
1724
  //   * implementation limitations
1725
  //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1726
  //   * 128bit vroundpd instruction is present only in AVX1
1727
  int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1728
  switch (opcode) {
1729
    case Op_AbsVF:
1730
    case Op_NegVF:
1731
      if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1732
        return false; // 512bit vandps and vxorps are not available
1733
      }
1734
      break;
1735
    case Op_AbsVD:
1736
    case Op_NegVD:
1737
      if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1738
        return false; // 512bit vpmullq, vandpd and vxorpd are not available
1739
      }
1740
      break;
1741
    case Op_RotateRightV:
1742
    case Op_RotateLeftV:
1743
      if (bt != T_INT && bt != T_LONG) {
1744
        return false;
1745
      } // fallthrough
1746
    case Op_MacroLogicV:
1747
      if (!VM_Version::supports_evex() ||
1748
          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1749
        return false;
1750
      }
1751
      break;
1752
    case Op_ClearArray:
1753
    case Op_VectorMaskGen:
1754
    case Op_VectorCmpMasked:
1755
      if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1756
        return false;
1757
      }
1758
      if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1759
        return false;
1760
      }
1761
      break;
1762
    case Op_LoadVectorMasked:
1763
    case Op_StoreVectorMasked:
1764
      if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
1765
        return false;
1766
      }
1767
      break;
1768
    case Op_MaxV:
1769
    case Op_MinV:
1770
      if (UseSSE < 4 && is_integral_type(bt)) {
1771
        return false;
1772
      }
1773
      if ((bt == T_FLOAT || bt == T_DOUBLE)) {
1774
          // Float/Double intrinsics are enabled for AVX family currently.
1775
          if (UseAVX == 0) {
1776
            return false;
1777
          }
1778
          if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
1779
            return false;
1780
          }
1781
      }
1782
      break;
1783
    case Op_CallLeafVector:
1784
      if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1785
        return false;
1786
      }
1787
      break;
1788
    case Op_AddReductionVI:
1789
      if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1790
        return false;
1791
      }
1792
      // fallthrough
1793
    case Op_AndReductionV:
1794
    case Op_OrReductionV:
1795
    case Op_XorReductionV:
1796
      if (is_subword_type(bt) && (UseSSE < 4)) {
1797
        return false;
1798
      }
1799
#ifndef _LP64
1800
      if (bt == T_BYTE || bt == T_LONG) {
1801
        return false;
1802
      }
1803
#endif
1804
      break;
1805
#ifndef _LP64
1806
    case Op_VectorInsert:
1807
      if (bt == T_LONG || bt == T_DOUBLE) {
1808
        return false;
1809
      }
1810
      break;
1811
#endif
1812
    case Op_MinReductionV:
1813
    case Op_MaxReductionV:
1814
      if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1815
        return false;
1816
      } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1817
        return false;
1818
      }
1819
      // Float/Double intrinsics enabled for AVX family.
1820
      if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1821
        return false;
1822
      }
1823
      if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1824
        return false;
1825
      }
1826
#ifndef _LP64
1827
      if (bt == T_BYTE || bt == T_LONG) {
1828
        return false;
1829
      }
1830
#endif
1831
      break;
1832
    case Op_VectorTest:
1833
      if (UseSSE < 4) {
1834
        return false; // Implementation limitation
1835
      } else if (size_in_bits < 32) {
1836
        return false; // Implementation limitation
1837
      }
1838
      break;
1839
    case Op_VectorLoadShuffle:
1840
    case Op_VectorRearrange:
1841
      if (vlen == 2) {
1842
        return false; // Implementation limitation due to how shuffle is loaded
1843
      } else if (size_in_bits == 256 && UseAVX < 2) {
1844
        return false; // Implementation limitation
1845
      }
1846
      break;
1847
    case Op_VectorLoadMask:
1848
    case Op_VectorMaskCast:
1849
      if (size_in_bits == 256 && UseAVX < 2) {
1850
        return false; // Implementation limitation
1851
      }
1852
      // fallthrough
1853
    case Op_VectorStoreMask:
1854
      if (vlen == 2) {
1855
        return false; // Implementation limitation
1856
      }
1857
      break;
1858
    case Op_PopulateIndex:
1859
      if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
1860
        return false;
1861
      }
1862
      break;
1863
    case Op_VectorCastB2X:
1864
    case Op_VectorCastS2X:
1865
    case Op_VectorCastI2X:
1866
      if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
1867
        return false;
1868
      }
1869
      break;
1870
    case Op_VectorCastL2X:
1871
      if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1872
        return false;
1873
      } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1874
        return false;
1875
      }
1876
      break;
1877
    case Op_VectorCastF2X: {
1878
        // As per JLS section 5.1.3 narrowing conversion to sub-word types
1879
        // happen after intermediate conversion to integer and special handling
1880
        // code needs AVX2 vpcmpeqd instruction for 256 bit vectors.
1881
        int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
1882
        if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
1883
          return false;
1884
        }
1885
      }
1886
      // fallthrough
1887
    case Op_VectorCastD2X:
1888
      if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
1889
        return false;
1890
      }
1891
      break;
1892
    case Op_VectorCastF2HF:
1893
    case Op_VectorCastHF2F:
1894
      if (!VM_Version::supports_f16c() &&
1895
         ((!VM_Version::supports_evex() ||
1896
         ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
1897
        return false;
1898
      }
1899
      break;
1900
    case Op_RoundVD:
1901
      if (!VM_Version::supports_avx512dq()) {
1902
        return false;
1903
      }
1904
      break;
1905
    case Op_MulReductionVI:
1906
      if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1907
        return false;
1908
      }
1909
      break;
1910
    case Op_LoadVectorGatherMasked:
1911
      if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1912
        return false;
1913
      }
1914
      if (is_subword_type(bt) &&
1915
         (!is_LP64                                                ||
1916
         (size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
1917
         (size_in_bits < 64)                                      ||
1918
         (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
1919
        return false;
1920
      }
1921
      break;
1922
    case Op_StoreVectorScatterMasked:
1923
    case Op_StoreVectorScatter:
1924
      if (is_subword_type(bt)) {
1925
        return false;
1926
      } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1927
        return false;
1928
      }
1929
      // fallthrough
1930
    case Op_LoadVectorGather:
1931
      if (!is_subword_type(bt) && size_in_bits == 64) {
1932
        return false;
1933
      }
1934
      if (is_subword_type(bt) && size_in_bits < 64) {
1935
        return false;
1936
      }
1937
      break;
1938
    case Op_MaskAll:
1939
      if (!VM_Version::supports_evex()) {
1940
        return false;
1941
      }
1942
      if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
1943
        return false;
1944
      }
1945
      if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1946
        return false;
1947
      }
1948
      break;
1949
    case Op_VectorMaskCmp:
1950
      if (vlen < 2 || size_in_bits < 32) {
1951
        return false;
1952
      }
1953
      break;
1954
    case Op_CompressM:
1955
      if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
1956
        return false;
1957
      }
1958
      break;
1959
    case Op_CompressV:
1960
    case Op_ExpandV:
1961
      if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
1962
        return false;
1963
      }
1964
      if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
1965
        return false;
1966
      }
1967
      if (size_in_bits < 128) {
1968
        return false;
1969
      }
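      // fallthrough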
1970
    case Op_VectorLongToMask:
1971
      if (UseAVX < 1 || !is_LP64) {
1972
        return false;
1973
      }
1974
      if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
1975
        return false;
1976
      }
1977
      break;
1978
    case Op_SignumVD:
1979
    case Op_SignumVF:
1980
      if (UseAVX < 1) {
1981
        return false;
1982
      }
1983
      break;
1984
    case Op_PopCountVI:
1985
    case Op_PopCountVL: {
1986
        if (!is_pop_count_instr_target(bt) &&
1987
            (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
1988
          return false;
1989
        }
1990
      }
1991
      break;
1992
    case Op_ReverseV:
1993
    case Op_ReverseBytesV:
1994
      if (UseAVX < 2) {
1995
        return false;
1996
      }
1997
      break;
1998
    case Op_CountTrailingZerosV:
1999
    case Op_CountLeadingZerosV:
2000
      if (UseAVX < 2) {
2001
        return false;
2002
      }
2003
      break;
2004
  }
2005
  return true;  // Match rules are supported by default.
2006
}
2007

2008
bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
2009
  // The ADLC based match_rule_supported routine checks for the existence of a pattern
  // based on the IR opcode. Most of the unary/binary/ternary masked operations share
  // the IR nodes of their non-masked counterparts, with the mask edge being the
  // differentiator. This routine does a stricter check and returns false for every
  // opcode other than the ones whose masked instruction patterns are defined in this file.
2015
  if (!match_rule_supported_vector(opcode, vlen, bt)) {
2016
    return false;
2017
  }
2018

2019
  const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
2020
  int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
2021
  if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
2022
    return false;
2023
  }
2024
  switch(opcode) {
2025
    // Unary masked operations
2026
    case Op_AbsVB:
2027
    case Op_AbsVS:
2028
      if (!VM_Version::supports_avx512bw()) {
2029
        return false;  // Implementation limitation
2030
      }
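      // fallthrough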
2031
    case Op_AbsVI:
2032
    case Op_AbsVL:
2033
      return true;
2034

2035
    // Ternary masked operations
2036
    case Op_FmaVF:
2037
    case Op_FmaVD:
2038
      return true;
2039

2040
    case Op_MacroLogicV:
2041
      if (bt != T_INT && bt != T_LONG) {
2042
        return false;
2043
      }
2044
      return true;
2045

2046
    // Binary masked operations
2047
    case Op_AddVB:
2048
    case Op_AddVS:
2049
    case Op_SubVB:
2050
    case Op_SubVS:
2051
    case Op_MulVS:
2052
    case Op_LShiftVS:
2053
    case Op_RShiftVS:
2054
    case Op_URShiftVS:
2055
      assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
2056
      if (!VM_Version::supports_avx512bw()) {
2057
        return false;  // Implementation limitation
2058
      }
2059
      return true;
2060

2061
    case Op_MulVL:
2062
      assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
2063
      if (!VM_Version::supports_avx512dq()) {
2064
        return false;  // Implementation limitation
2065
      }
2066
      return true;
2067

2068
    case Op_AndV:
2069
    case Op_OrV:
2070
    case Op_XorV:
2071
    case Op_RotateRightV:
2072
    case Op_RotateLeftV:
2073
      if (bt != T_INT && bt != T_LONG) {
2074
        return false; // Implementation limitation
2075
      }
2076
      return true;
2077

2078
    case Op_VectorLoadMask:
2079
      assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
2080
      if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
2081
        return false;
2082
      }
2083
      return true;
2084

2085
    case Op_AddVI:
2086
    case Op_AddVL:
2087
    case Op_AddVF:
2088
    case Op_AddVD:
2089
    case Op_SubVI:
2090
    case Op_SubVL:
2091
    case Op_SubVF:
2092
    case Op_SubVD:
2093
    case Op_MulVI:
2094
    case Op_MulVF:
2095
    case Op_MulVD:
2096
    case Op_DivVF:
2097
    case Op_DivVD:
2098
    case Op_SqrtVF:
2099
    case Op_SqrtVD:
2100
    case Op_LShiftVI:
2101
    case Op_LShiftVL:
2102
    case Op_RShiftVI:
2103
    case Op_RShiftVL:
2104
    case Op_URShiftVI:
2105
    case Op_URShiftVL:
2106
    case Op_LoadVectorMasked:
2107
    case Op_StoreVectorMasked:
2108
    case Op_LoadVectorGatherMasked:
2109
    case Op_StoreVectorScatterMasked:
2110
      return true;
2111

2112
    case Op_MaxV:
2113
    case Op_MinV:
2114
      if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
2115
        return false; // Implementation limitation
2116
      }
2117
      if (is_floating_point_type(bt)) {
2118
        return false; // Implementation limitation
2119
      }
2120
      return true;
2121

2122
    case Op_VectorMaskCmp:
2123
      if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
2124
        return false; // Implementation limitation
2125
      }
2126
      return true;
2127

2128
    case Op_VectorRearrange:
2129
      if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
2130
        return false; // Implementation limitation
2131
      }
2132
      if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
2133
        return false; // Implementation limitation
2134
      } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
2135
        return false; // Implementation limitation
2136
      }
2137
      return true;
2138

2139
    // Binary Logical operations
2140
    case Op_AndVMask:
2141
    case Op_OrVMask:
2142
    case Op_XorVMask:
2143
      if (vlen > 16 && !VM_Version::supports_avx512bw()) {
2144
        return false; // Implementation limitation
2145
      }
2146
      return true;
2147

2148
    case Op_PopCountVI:
2149
    case Op_PopCountVL:
2150
      if (!is_pop_count_instr_target(bt)) {
2151
        return false;
2152
      }
2153
      return true;
2154

2155
    case Op_MaskAll:
2156
      return true;
2157

2158
    case Op_CountLeadingZerosV:
2159
      if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
2160
        return true;
2161
      }
2162
    default:
2163
      return false;
2164
  }
2165
}
2166

2167
bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
2168
  return false;
2169
}
2170

2171
MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
2172
  assert(Matcher::is_generic_vector(generic_opnd), "not generic");
2173
  bool legacy = (generic_opnd->opcode() == LEGVEC);
2174
  if (!VM_Version::supports_avx512vlbwdq() && // KNL
2175
      is_temp && !legacy && (ideal_reg == Op_VecZ)) {
2176
    // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
2177
    return new legVecZOper();
2178
  }
2179
  if (legacy) {
2180
    switch (ideal_reg) {
2181
      case Op_VecS: return new legVecSOper();
2182
      case Op_VecD: return new legVecDOper();
2183
      case Op_VecX: return new legVecXOper();
2184
      case Op_VecY: return new legVecYOper();
2185
      case Op_VecZ: return new legVecZOper();
2186
    }
2187
  } else {
2188
    switch (ideal_reg) {
2189
      case Op_VecS: return new vecSOper();
2190
      case Op_VecD: return new vecDOper();
2191
      case Op_VecX: return new vecXOper();
2192
      case Op_VecY: return new vecYOper();
2193
      case Op_VecZ: return new vecZOper();
2194
    }
2195
  }
2196
  ShouldNotReachHere();
2197
  return nullptr;
2198
}
2199

2200
bool Matcher::is_reg2reg_move(MachNode* m) {
2201
  switch (m->rule()) {
2202
    case MoveVec2Leg_rule:
2203
    case MoveLeg2Vec_rule:
2204
    case MoveF2VL_rule:
2205
    case MoveF2LEG_rule:
2206
    case MoveVL2F_rule:
2207
    case MoveLEG2F_rule:
2208
    case MoveD2VL_rule:
2209
    case MoveD2LEG_rule:
2210
    case MoveVL2D_rule:
2211
    case MoveLEG2D_rule:
2212
      return true;
2213
    default:
2214
      return false;
2215
  }
2216
}
2217

2218
bool Matcher::is_generic_vector(MachOper* opnd) {
2219
  switch (opnd->opcode()) {
2220
    case VEC:
2221
    case LEGVEC:
2222
      return true;
2223
    default:
2224
      return false;
2225
  }
2226
}
2227

2228
//------------------------------------------------------------------------
2229

2230
const RegMask* Matcher::predicate_reg_mask(void) {
2231
  return &_VECTMASK_REG_mask;
2232
}
2233

2234
const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
2235
  return new TypeVectMask(elemTy, length);
2236
}
2237

2238
// Max vector size in bytes. 0 if not supported.
2239
int Matcher::vector_width_in_bytes(BasicType bt) {
2240
  assert(is_java_primitive(bt), "only primitive type vectors");
2241
  if (UseSSE < 2) return 0;
2242
  // SSE2 supports 128bit vectors for all types.
2243
  // AVX2 supports 256bit vectors for all types.
2244
  // AVX512 (EVEX) supports 512bit vectors for all types.
2245
  int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
2246
  // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
2247
  if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
2248
    size = (UseAVX > 2) ? 64 : 32;
2249
  if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
2250
    size = (VM_Version::supports_avx512bw()) ? 64 : 32;
2251
  // Use flag to limit vector size.
2252
  size = MIN2(size,(int)MaxVectorSize);
2253
  // Minimum 2 values in vector (or 4 for bytes).
2254
  switch (bt) {
2255
  case T_DOUBLE:
2256
  case T_LONG:
2257
    if (size < 16) return 0;
2258
    break;
2259
  case T_FLOAT:
2260
  case T_INT:
2261
    if (size < 8) return 0;
2262
    break;
2263
  case T_BOOLEAN:
2264
    if (size < 4) return 0;
2265
    break;
2266
  case T_CHAR:
2267
    if (size < 4) return 0;
2268
    break;
2269
  case T_BYTE:
2270
    if (size < 4) return 0;
2271
    break;
2272
  case T_SHORT:
2273
    if (size < 4) return 0;
2274
    break;
2275
  default:
2276
    ShouldNotReachHere();
2277
  }
2278
  return size;
2279
}
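
// Worked example (illustrative): with UseAVX=2 and MaxVectorSize=32, T_INT yields
// size = (1 << 2) * 8 = 32 bytes (256bit); with UseAVX=1 the same query returns 16 bytes,
// since only FLOAT and DOUBLE take the 256bit AVX1 path above.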
2280

2281
// Limits on vector size (number of elements) loaded into vector.
2282
int Matcher::max_vector_size(const BasicType bt) {
2283
  return vector_width_in_bytes(bt)/type2aelembytes(bt);
2284
}
2285
int Matcher::min_vector_size(const BasicType bt) {
2286
  int max_size = max_vector_size(bt);
2287
  // Min size which can be loaded into vector is 4 bytes.
2288
  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
2289
  // Support for calling svml double64 vectors
2290
  if (bt == T_DOUBLE) {
2291
    size = 1;
2292
  }
2293
  return MIN2(size,max_size);
2294
}
2295

2296
int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
2297
  // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
2298
  // by default on Cascade Lake
2299
  if (VM_Version::is_default_intel_cascade_lake()) {
2300
    return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
2301
  }
2302
  return Matcher::max_vector_size(bt);
2303
}
2304

2305
int Matcher::scalable_vector_reg_size(const BasicType bt) {
2306
  return -1;
2307
}
2308

2309
// Vector ideal reg corresponding to specified size in bytes
2310
uint Matcher::vector_ideal_reg(int size) {
2311
  assert(MaxVectorSize >= size, "");
2312
  switch(size) {
2313
    case  4: return Op_VecS;
2314
    case  8: return Op_VecD;
2315
    case 16: return Op_VecX;
2316
    case 32: return Op_VecY;
2317
    case 64: return Op_VecZ;
2318
  }
2319
  ShouldNotReachHere();
2320
  return 0;
2321
}
2322

2323
// Check for shift by small constant as well
2324
static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
2325
  if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
2326
      shift->in(2)->get_int() <= 3 &&
2327
      // Are there other uses besides address expressions?
2328
      !matcher->is_visited(shift)) {
2329
    address_visited.set(shift->_idx); // Flag as address_visited
2330
    mstack.push(shift->in(2), Matcher::Visit);
2331
    Node *conv = shift->in(1);
2332
#ifdef _LP64
2333
    // Allow Matcher to match the rule which bypass
2334
    // ConvI2L operation for an array index on LP64
2335
    // if the index value is positive.
2336
    if (conv->Opcode() == Op_ConvI2L &&
2337
        conv->as_Type()->type()->is_long()->_lo >= 0 &&
2338
        // Are there other uses besides address expressions?
2339
        !matcher->is_visited(conv)) {
2340
      address_visited.set(conv->_idx); // Flag as address_visited
2341
      mstack.push(conv->in(1), Matcher::Pre_Visit);
2342
    } else
2343
#endif
2344
      mstack.push(conv, Matcher::Pre_Visit);
2345
    return true;
2346
  }
2347
  return false;
2348
}
2349

2350
// This function identifies sub-graphs in which a 'load' node is
2351
// input to two different nodes, and such that it can be matched
2352
// with BMI instructions like blsi, blsr, etc.
2353
// Example : for b = -a[i] & a[i] can be matched to blsi r32, m32.
2354
// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2355
// refers to the same node.
2356
//
2357
// Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2358
// This is a temporary solution until we make DAGs expressible in ADL.
2359
template<typename ConType>
2360
class FusedPatternMatcher {
2361
  Node* _op1_node;
2362
  Node* _mop_node;
2363
  int _con_op;
2364

2365
  static int match_next(Node* n, int next_op, int next_op_idx) {
2366
    if (n->in(1) == nullptr || n->in(2) == nullptr) {
2367
      return -1;
2368
    }
2369

2370
    if (next_op_idx == -1) { // n is commutative, try rotations
2371
      if (n->in(1)->Opcode() == next_op) {
2372
        return 1;
2373
      } else if (n->in(2)->Opcode() == next_op) {
2374
        return 2;
2375
      }
2376
    } else {
2377
      assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2378
      if (n->in(next_op_idx)->Opcode() == next_op) {
2379
        return next_op_idx;
2380
      }
2381
    }
2382
    return -1;
2383
  }
2384

2385
 public:
2386
  FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2387
    _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2388

2389
  bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2390
             int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2391
             typename ConType::NativeType con_value) {
2392
    if (_op1_node->Opcode() != op1) {
2393
      return false;
2394
    }
2395
    if (_mop_node->outcnt() > 2) {
2396
      return false;
2397
    }
2398
    op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2399
    if (op1_op2_idx == -1) {
2400
      return false;
2401
    }
2402
    // Memory operation must be the other edge
2403
    int op1_mop_idx = (op1_op2_idx & 1) + 1;
2404

2405
    // Check that the mop node is really what we want
2406
    if (_op1_node->in(op1_mop_idx) == _mop_node) {
2407
      Node* op2_node = _op1_node->in(op1_op2_idx);
2408
      if (op2_node->outcnt() > 1) {
2409
        return false;
2410
      }
2411
      assert(op2_node->Opcode() == op2, "Should be");
2412
      op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2413
      if (op2_con_idx == -1) {
2414
        return false;
2415
      }
2416
      // Memory operation must be the other edge
2417
      int op2_mop_idx = (op2_con_idx & 1) + 1;
2418
      // Check that the memory operation is the same node
2419
      if (op2_node->in(op2_mop_idx) == _mop_node) {
2420
        // Now check the constant
2421
        const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2422
        if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2423
          return true;
2424
        }
2425
      }
2426
    }
2427
    return false;
2428
  }
2429
};
2430

2431
static bool is_bmi_pattern(Node* n, Node* m) {
2432
  assert(UseBMI1Instructions, "sanity");
2433
  if (n != nullptr && m != nullptr) {
2434
    if (m->Opcode() == Op_LoadI) {
2435
      FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2436
      return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2437
             bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2438
             bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2439
    } else if (m->Opcode() == Op_LoadL) {
2440
      FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2441
      return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2442
             bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2443
             bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2444
    }
2445
  }
2446
  return false;
2447
}
2448

2449
// Should the matcher clone input 'm' of node 'n'?
2450
bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2451
  // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
2452
  if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2453
    mstack.push(m, Visit);
2454
    return true;
2455
  }
2456
  if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2457
    mstack.push(m, Visit);           // m = ShiftCntV
2458
    return true;
2459
  }
2460
  return false;
2461
}
2462

2463
// Should the Matcher clone shifts on addressing modes, expecting them
2464
// to be subsumed into complex addressing expressions or compute them
2465
// into registers?
2466
bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2467
  Node *off = m->in(AddPNode::Offset);
2468
  if (off->is_Con()) {
2469
    address_visited.test_set(m->_idx); // Flag as address_visited
2470
    Node *adr = m->in(AddPNode::Address);
2471

2472
    // Intel can handle 2 adds in addressing mode
2473
    // AtomicAdd is not an addressing expression.
2474
    // Cheap to find it by looking for screwy base.
2475
    if (adr->is_AddP() &&
2476
        !adr->in(AddPNode::Base)->is_top() &&
2477
        LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2478
        // Are there other uses besides address expressions?
2479
        !is_visited(adr)) {
2480
      address_visited.set(adr->_idx); // Flag as address_visited
2481
      Node *shift = adr->in(AddPNode::Offset);
2482
      if (!clone_shift(shift, this, mstack, address_visited)) {
2483
        mstack.push(shift, Pre_Visit);
2484
      }
2485
      mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2486
      mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2487
    } else {
2488
      mstack.push(adr, Pre_Visit);
2489
    }
2490

2491
    // Clone X+offset as it also folds into most addressing expressions
2492
    mstack.push(off, Visit);
2493
    mstack.push(m->in(AddPNode::Base), Pre_Visit);
2494
    return true;
2495
  } else if (clone_shift(off, this, mstack, address_visited)) {
2496
    address_visited.test_set(m->_idx); // Flag as address_visited
2497
    mstack.push(m->in(AddPNode::Address), Pre_Visit);
2498
    mstack.push(m->in(AddPNode::Base), Pre_Visit);
2499
    return true;
2500
  }
2501
  return false;
2502
}
2503

2504
static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2505
  switch (bt) {
2506
    case BoolTest::eq:
2507
      return Assembler::eq;
2508
    case BoolTest::ne:
2509
      return Assembler::neq;
2510
    case BoolTest::le:
2511
    case BoolTest::ule:
2512
      return Assembler::le;
2513
    case BoolTest::ge:
2514
    case BoolTest::uge:
2515
      return Assembler::nlt;
2516
    case BoolTest::lt:
2517
    case BoolTest::ult:
2518
      return Assembler::lt;
2519
    case BoolTest::gt:
2520
    case BoolTest::ugt:
2521
      return Assembler::nle;
2522
    default : ShouldNotReachHere(); return Assembler::_false;
2523
  }
2524
}
2525

2526
static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2527
  switch (bt) {
2528
  case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2529
  // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2530
  case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2531
  case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2532
  case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2533
  case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2534
  case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2535
  default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2536
  }
2537
}
2538

2539
// Helper methods for MachSpillCopyNode::implementation().
2540
static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
2541
                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
2542
  assert(ireg == Op_VecS || // 32bit vector
2543
         ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2544
          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
2545
         "no non-adjacent vector moves" );
2546
  if (masm) {
2547
    switch (ireg) {
2548
    case Op_VecS: // copy whole register
2549
    case Op_VecD:
2550
    case Op_VecX:
2551
#ifndef _LP64
2552
      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2553
#else
2554
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2555
        __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2556
      } else {
2557
        __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2558
     }
2559
#endif
2560
      break;
2561
    case Op_VecY:
2562
#ifndef _LP64
2563
      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2564
#else
2565
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2566
        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2567
      } else {
2568
        __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2569
     }
2570
#endif
2571
      break;
2572
    case Op_VecZ:
2573
      __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2574
      break;
2575
    default:
2576
      ShouldNotReachHere();
2577
    }
2578
#ifndef PRODUCT
2579
  } else {
2580
    switch (ireg) {
2581
    case Op_VecS:
2582
    case Op_VecD:
2583
    case Op_VecX:
2584
      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2585
      break;
2586
    case Op_VecY:
2587
    case Op_VecZ:
2588
      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2589
      break;
2590
    default:
2591
      ShouldNotReachHere();
2592
    }
2593
#endif
2594
  }
2595
}
2596

2597
void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
2598
                     int stack_offset, int reg, uint ireg, outputStream* st) {
2599
  if (masm) {
2600
    if (is_load) {
2601
      switch (ireg) {
2602
      case Op_VecS:
2603
        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2604
        break;
2605
      case Op_VecD:
2606
        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2607
        break;
2608
      case Op_VecX:
2609
#ifndef _LP64
2610
        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2611
#else
2612
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2613
          __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2614
        } else {
2615
          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2616
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
2617
        }
2618
#endif
2619
        break;
2620
      case Op_VecY:
2621
#ifndef _LP64
2622
        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2623
#else
2624
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2625
          __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2626
        } else {
2627
          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2628
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
2629
        }
2630
#endif
2631
        break;
2632
      case Op_VecZ:
2633
        __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2634
        break;
2635
      default:
2636
        ShouldNotReachHere();
2637
      }
2638
    } else { // store
2639
      switch (ireg) {
2640
      case Op_VecS:
2641
        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2642
        break;
2643
      case Op_VecD:
2644
        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2645
        break;
2646
      case Op_VecX:
2647
#ifndef _LP64
2648
        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2649
#else
2650
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2651
          __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2652
        }
2653
        else {
2654
          __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2655
        }
2656
#endif
2657
        break;
2658
      case Op_VecY:
2659
#ifndef _LP64
2660
        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2661
#else
2662
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2663
          __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2664
        }
2665
        else {
2666
          __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2667
        }
2668
#endif
2669
        break;
2670
      case Op_VecZ:
2671
        __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2672
        break;
2673
      default:
2674
        ShouldNotReachHere();
2675
      }
2676
    }
2677
#ifndef PRODUCT
2678
  } else {
2679
    if (is_load) {
2680
      switch (ireg) {
2681
      case Op_VecS:
2682
        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2683
        break;
2684
      case Op_VecD:
2685
        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2686
        break;
2687
       case Op_VecX:
2688
        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2689
        break;
2690
      case Op_VecY:
2691
      case Op_VecZ:
2692
        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2693
        break;
2694
      default:
2695
        ShouldNotReachHere();
2696
      }
2697
    } else { // store
2698
      switch (ireg) {
2699
      case Op_VecS:
2700
        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2701
        break;
2702
      case Op_VecD:
2703
        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2704
        break;
2705
       case Op_VecX:
2706
        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2707
        break;
2708
      case Op_VecY:
2709
      case Op_VecZ:
2710
        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2711
        break;
2712
      default:
2713
        ShouldNotReachHere();
2714
      }
2715
    }
2716
#endif
2717
  }
2718
}
2719

2720
template <class T>
2721
static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
2722
  GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
2723
  jvalue ele;
2724
  switch (bt) {
2725
    case T_BYTE:   ele.b = con; break;
2726
    case T_SHORT:  ele.s = con; break;
2727
    case T_INT:    ele.i = con; break;
2728
    case T_LONG:   ele.j = con; break;
2729
    case T_FLOAT:  ele.f = con; break;
2730
    case T_DOUBLE: ele.d = con; break;
2731
    default: ShouldNotReachHere();
2732
  }
2733
  for (int i = 0; i < len; i++) {
2734
    val->append(ele);
2735
  }
2736
  return val;
2737
}
2738

2739
static inline jlong high_bit_set(BasicType bt) {
2740
  switch (bt) {
2741
    case T_BYTE:  return 0x8080808080808080;
2742
    case T_SHORT: return 0x8000800080008000;
2743
    case T_INT:   return 0x8000000080000000;
2744
    case T_LONG:  return 0x8000000000000000;
2745
    default:
2746
      ShouldNotReachHere();
2747
      return 0;
2748
  }
2749
}
2750

2751
#ifndef PRODUCT
2752
  void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2753
    st->print("nop \t# %d bytes pad for loops and calls", _count);
2754
  }
2755
#endif
2756

2757
  void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
2758
    __ nop(_count);
2759
  }
2760

2761
  uint MachNopNode::size(PhaseRegAlloc*) const {
2762
    return _count;
2763
  }
2764

2765
#ifndef PRODUCT
2766
  void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2767
    st->print("# breakpoint");
2768
  }
2769
#endif
2770

2771
  void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
2772
    __ int3();
2773
  }
2774

2775
  uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2776
    return MachNode::size(ra_);
2777
  }
2778

2779
%}
2780

2781
encode %{
2782

2783
  enc_class call_epilog %{
2784
    if (VerifyStackAtCalls) {
2785
      // Check that stack depth is unchanged: find majik cookie on stack
2786
      int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2787
      Label L;
2788
      __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2789
      __ jccb(Assembler::equal, L);
2790
      // Die if stack mismatch
2791
      __ int3();
2792
      __ bind(L);
2793
    }
2794
  %}
2795

2796
%}
2797

2798
// Operands for bound floating pointer register arguments
2799
operand rxmm0() %{
2800
  constraint(ALLOC_IN_RC(xmm0_reg));
2801
  match(VecX);
2802
  format%{%}
2803
  interface(REG_INTER);
2804
%}
2805

2806
//----------OPERANDS-----------------------------------------------------------
2807
// Operand definitions must precede instruction definitions for correct parsing
2808
// in the ADLC because operands constitute user defined types which are used in
2809
// instruction definitions.
2810

2811
// Vectors
2812

2813
// Dummy generic vector class. Should be used for all vector operands.
2814
// Replaced with vec[SDXYZ] during post-selection pass.
2815
operand vec() %{
2816
  constraint(ALLOC_IN_RC(dynamic));
2817
  match(VecX);
2818
  match(VecY);
2819
  match(VecZ);
2820
  match(VecS);
2821
  match(VecD);
2822

2823
  format %{ %}
2824
  interface(REG_INTER);
2825
%}
2826

2827
// Dummy generic legacy vector class. Should be used for all legacy vector operands.
2828
// Replaced with legVec[SDXYZ] during post-selection cleanup.
2829
// Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
2830
// runtime code generation via reg_class_dynamic.
2831
operand legVec() %{
2832
  constraint(ALLOC_IN_RC(dynamic));
2833
  match(VecX);
2834
  match(VecY);
2835
  match(VecZ);
2836
  match(VecS);
2837
  match(VecD);
2838

2839
  format %{ %}
2840
  interface(REG_INTER);
2841
%}
2842

2843
// Replaces vec during post-selection cleanup. See above.
2844
operand vecS() %{
2845
  constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2846
  match(VecS);
2847

2848
  format %{ %}
2849
  interface(REG_INTER);
2850
%}
2851

2852
// Replaces legVec during post-selection cleanup. See above.
2853
operand legVecS() %{
2854
  constraint(ALLOC_IN_RC(vectors_reg_legacy));
2855
  match(VecS);
2856

2857
  format %{ %}
2858
  interface(REG_INTER);
2859
%}
2860

2861
// Replaces vec during post-selection cleanup. See above.
2862
operand vecD() %{
2863
  constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2864
  match(VecD);
2865

2866
  format %{ %}
2867
  interface(REG_INTER);
2868
%}
2869

2870
// Replaces legVec during post-selection cleanup. See above.
2871
operand legVecD() %{
2872
  constraint(ALLOC_IN_RC(vectord_reg_legacy));
2873
  match(VecD);
2874

2875
  format %{ %}
2876
  interface(REG_INTER);
2877
%}
2878

2879
// Replaces vec during post-selection cleanup. See above.
2880
operand vecX() %{
2881
  constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2882
  match(VecX);
2883

2884
  format %{ %}
2885
  interface(REG_INTER);
2886
%}
2887

2888
// Replaces legVec during post-selection cleanup. See above.
2889
operand legVecX() %{
2890
  constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2891
  match(VecX);
2892

2893
  format %{ %}
2894
  interface(REG_INTER);
2895
%}
2896

2897
// Replaces vec during post-selection cleanup. See above.
2898
operand vecY() %{
2899
  constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2900
  match(VecY);
2901

2902
  format %{ %}
2903
  interface(REG_INTER);
2904
%}
2905

2906
// Replaces legVec during post-selection cleanup. See above.
2907
operand legVecY() %{
2908
  constraint(ALLOC_IN_RC(vectory_reg_legacy));
2909
  match(VecY);
2910

2911
  format %{ %}
2912
  interface(REG_INTER);
2913
%}
2914

2915
// Replaces vec during post-selection cleanup. See above.
2916
operand vecZ() %{
2917
  constraint(ALLOC_IN_RC(vectorz_reg));
2918
  match(VecZ);
2919

2920
  format %{ %}
2921
  interface(REG_INTER);
2922
%}
2923

2924
// Replaces legVec during post-selection cleanup. See above.
2925
operand legVecZ() %{
2926
  constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2927
  match(VecZ);
2928

2929
  format %{ %}
2930
  interface(REG_INTER);
2931
%}
2932

2933
// INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2934

2935
// ============================================================================
2936

2937
instruct ShouldNotReachHere() %{
2938
  match(Halt);
2939
  format %{ "stop\t# ShouldNotReachHere" %}
2940
  ins_encode %{
2941
    if (is_reachable()) {
2942
      __ stop(_halt_reason);
2943
    }
2944
  %}
2945
  ins_pipe(pipe_slow);
2946
%}
2947

2948
// ============================================================================
2949

2950
instruct addF_reg(regF dst, regF src) %{
2951
  predicate((UseSSE>=1) && (UseAVX == 0));
2952
  match(Set dst (AddF dst src));
2953

2954
  format %{ "addss   $dst, $src" %}
2955
  ins_cost(150);
2956
  ins_encode %{
2957
    __ addss($dst$$XMMRegister, $src$$XMMRegister);
2958
  %}
2959
  ins_pipe(pipe_slow);
2960
%}
2961

2962
instruct addF_mem(regF dst, memory src) %{
2963
  predicate((UseSSE>=1) && (UseAVX == 0));
2964
  match(Set dst (AddF dst (LoadF src)));
2965

2966
  format %{ "addss   $dst, $src" %}
2967
  ins_cost(150);
2968
  ins_encode %{
2969
    __ addss($dst$$XMMRegister, $src$$Address);
2970
  %}
2971
  ins_pipe(pipe_slow);
2972
%}
2973

2974
instruct addF_imm(regF dst, immF con) %{
2975
  predicate((UseSSE>=1) && (UseAVX == 0));
2976
  match(Set dst (AddF dst con));
2977
  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2978
  ins_cost(150);
2979
  ins_encode %{
2980
    __ addss($dst$$XMMRegister, $constantaddress($con));
2981
  %}
2982
  ins_pipe(pipe_slow);
2983
%}
2984

2985
instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2986
  predicate(UseAVX > 0);
2987
  match(Set dst (AddF src1 src2));
2988

2989
  format %{ "vaddss  $dst, $src1, $src2" %}
2990
  ins_cost(150);
2991
  ins_encode %{
2992
    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2993
  %}
2994
  ins_pipe(pipe_slow);
2995
%}
2996

2997
instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2998
  predicate(UseAVX > 0);
2999
  match(Set dst (AddF src1 (LoadF src2)));
3000

3001
  format %{ "vaddss  $dst, $src1, $src2" %}
3002
  ins_cost(150);
3003
  ins_encode %{
3004
    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3005
  %}
3006
  ins_pipe(pipe_slow);
3007
%}
3008

3009
instruct addF_reg_imm(regF dst, regF src, immF con) %{
3010
  predicate(UseAVX > 0);
3011
  match(Set dst (AddF src con));
3012

3013
  format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3014
  ins_cost(150);
3015
  ins_encode %{
3016
    __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3017
  %}
3018
  ins_pipe(pipe_slow);
3019
%}
3020

3021
instruct addD_reg(regD dst, regD src) %{
3022
  predicate((UseSSE>=2) && (UseAVX == 0));
3023
  match(Set dst (AddD dst src));
3024

3025
  format %{ "addsd   $dst, $src" %}
3026
  ins_cost(150);
3027
  ins_encode %{
3028
    __ addsd($dst$$XMMRegister, $src$$XMMRegister);
3029
  %}
3030
  ins_pipe(pipe_slow);
3031
%}
3032

3033
instruct addD_mem(regD dst, memory src) %{
3034
  predicate((UseSSE>=2) && (UseAVX == 0));
3035
  match(Set dst (AddD dst (LoadD src)));
3036

3037
  format %{ "addsd   $dst, $src" %}
3038
  ins_cost(150);
3039
  ins_encode %{
3040
    __ addsd($dst$$XMMRegister, $src$$Address);
3041
  %}
3042
  ins_pipe(pipe_slow);
3043
%}
3044

3045
instruct addD_imm(regD dst, immD con) %{
3046
  predicate((UseSSE>=2) && (UseAVX == 0));
3047
  match(Set dst (AddD dst con));
3048
  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3049
  ins_cost(150);
3050
  ins_encode %{
3051
    __ addsd($dst$$XMMRegister, $constantaddress($con));
3052
  %}
3053
  ins_pipe(pipe_slow);
3054
%}
3055

3056
instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
3057
  predicate(UseAVX > 0);
3058
  match(Set dst (AddD src1 src2));
3059

3060
  format %{ "vaddsd  $dst, $src1, $src2" %}
3061
  ins_cost(150);
3062
  ins_encode %{
3063
    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3064
  %}
3065
  ins_pipe(pipe_slow);
3066
%}
3067

3068
instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
3069
  predicate(UseAVX > 0);
3070
  match(Set dst (AddD src1 (LoadD src2)));
3071

3072
  format %{ "vaddsd  $dst, $src1, $src2" %}
3073
  ins_cost(150);
3074
  ins_encode %{
3075
    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3076
  %}
3077
  ins_pipe(pipe_slow);
3078
%}
3079

3080
instruct addD_reg_imm(regD dst, regD src, immD con) %{
3081
  predicate(UseAVX > 0);
3082
  match(Set dst (AddD src con));
3083

3084
  format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3085
  ins_cost(150);
3086
  ins_encode %{
3087
    __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3088
  %}
3089
  ins_pipe(pipe_slow);
3090
%}
3091

3092
instruct subF_reg(regF dst, regF src) %{
3093
  predicate((UseSSE>=1) && (UseAVX == 0));
3094
  match(Set dst (SubF dst src));
3095

3096
  format %{ "subss   $dst, $src" %}
3097
  ins_cost(150);
3098
  ins_encode %{
3099
    __ subss($dst$$XMMRegister, $src$$XMMRegister);
3100
  %}
3101
  ins_pipe(pipe_slow);
3102
%}
3103

3104
instruct subF_mem(regF dst, memory src) %{
3105
  predicate((UseSSE>=1) && (UseAVX == 0));
3106
  match(Set dst (SubF dst (LoadF src)));
3107

3108
  format %{ "subss   $dst, $src" %}
3109
  ins_cost(150);
3110
  ins_encode %{
3111
    __ subss($dst$$XMMRegister, $src$$Address);
3112
  %}
3113
  ins_pipe(pipe_slow);
3114
%}
3115

3116
instruct subF_imm(regF dst, immF con) %{
3117
  predicate((UseSSE>=1) && (UseAVX == 0));
3118
  match(Set dst (SubF dst con));
3119
  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3120
  ins_cost(150);
3121
  ins_encode %{
3122
    __ subss($dst$$XMMRegister, $constantaddress($con));
3123
  %}
3124
  ins_pipe(pipe_slow);
3125
%}
3126

3127
instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
3128
  predicate(UseAVX > 0);
3129
  match(Set dst (SubF src1 src2));
3130

3131
  format %{ "vsubss  $dst, $src1, $src2" %}
3132
  ins_cost(150);
3133
  ins_encode %{
3134
    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3135
  %}
3136
  ins_pipe(pipe_slow);
3137
%}
3138

3139
instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
3140
  predicate(UseAVX > 0);
3141
  match(Set dst (SubF src1 (LoadF src2)));
3142

3143
  format %{ "vsubss  $dst, $src1, $src2" %}
3144
  ins_cost(150);
3145
  ins_encode %{
3146
    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3147
  %}
3148
  ins_pipe(pipe_slow);
3149
%}
3150

3151
instruct subF_reg_imm(regF dst, regF src, immF con) %{
3152
  predicate(UseAVX > 0);
3153
  match(Set dst (SubF src con));
3154

3155
  format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3156
  ins_cost(150);
3157
  ins_encode %{
3158
    __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3159
  %}
3160
  ins_pipe(pipe_slow);
3161
%}
3162

3163
instruct subD_reg(regD dst, regD src) %{
3164
  predicate((UseSSE>=2) && (UseAVX == 0));
3165
  match(Set dst (SubD dst src));
3166

3167
  format %{ "subsd   $dst, $src" %}
3168
  ins_cost(150);
3169
  ins_encode %{
3170
    __ subsd($dst$$XMMRegister, $src$$XMMRegister);
3171
  %}
3172
  ins_pipe(pipe_slow);
3173
%}
3174

3175
instruct subD_mem(regD dst, memory src) %{
3176
  predicate((UseSSE>=2) && (UseAVX == 0));
3177
  match(Set dst (SubD dst (LoadD src)));
3178

3179
  format %{ "subsd   $dst, $src" %}
3180
  ins_cost(150);
3181
  ins_encode %{
3182
    __ subsd($dst$$XMMRegister, $src$$Address);
3183
  %}
3184
  ins_pipe(pipe_slow);
3185
%}
3186

3187
instruct subD_imm(regD dst, immD con) %{
3188
  predicate((UseSSE>=2) && (UseAVX == 0));
3189
  match(Set dst (SubD dst con));
3190
  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3191
  ins_cost(150);
3192
  ins_encode %{
3193
    __ subsd($dst$$XMMRegister, $constantaddress($con));
3194
  %}
3195
  ins_pipe(pipe_slow);
3196
%}
3197

3198
instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
3199
  predicate(UseAVX > 0);
3200
  match(Set dst (SubD src1 src2));
3201

3202
  format %{ "vsubsd  $dst, $src1, $src2" %}
3203
  ins_cost(150);
3204
  ins_encode %{
3205
    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3206
  %}
3207
  ins_pipe(pipe_slow);
3208
%}
3209

3210
instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
3211
  predicate(UseAVX > 0);
3212
  match(Set dst (SubD src1 (LoadD src2)));
3213

3214
  format %{ "vsubsd  $dst, $src1, $src2" %}
3215
  ins_cost(150);
3216
  ins_encode %{
3217
    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3218
  %}
3219
  ins_pipe(pipe_slow);
3220
%}
3221

3222
instruct subD_reg_imm(regD dst, regD src, immD con) %{
3223
  predicate(UseAVX > 0);
3224
  match(Set dst (SubD src con));
3225

3226
  format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3227
  ins_cost(150);
3228
  ins_encode %{
3229
    __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3230
  %}
3231
  ins_pipe(pipe_slow);
3232
%}
3233

3234
instruct mulF_reg(regF dst, regF src) %{
3235
  predicate((UseSSE>=1) && (UseAVX == 0));
3236
  match(Set dst (MulF dst src));
3237

3238
  format %{ "mulss   $dst, $src" %}
3239
  ins_cost(150);
3240
  ins_encode %{
3241
    __ mulss($dst$$XMMRegister, $src$$XMMRegister);
3242
  %}
3243
  ins_pipe(pipe_slow);
3244
%}
3245

3246
instruct mulF_mem(regF dst, memory src) %{
3247
  predicate((UseSSE>=1) && (UseAVX == 0));
3248
  match(Set dst (MulF dst (LoadF src)));
3249

3250
  format %{ "mulss   $dst, $src" %}
3251
  ins_cost(150);
3252
  ins_encode %{
3253
    __ mulss($dst$$XMMRegister, $src$$Address);
3254
  %}
3255
  ins_pipe(pipe_slow);
3256
%}
3257

3258
instruct mulF_imm(regF dst, immF con) %{
3259
  predicate((UseSSE>=1) && (UseAVX == 0));
3260
  match(Set dst (MulF dst con));
3261
  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3262
  ins_cost(150);
3263
  ins_encode %{
3264
    __ mulss($dst$$XMMRegister, $constantaddress($con));
3265
  %}
3266
  ins_pipe(pipe_slow);
3267
%}
3268

3269
instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
3270
  predicate(UseAVX > 0);
3271
  match(Set dst (MulF src1 src2));
3272

3273
  format %{ "vmulss  $dst, $src1, $src2" %}
3274
  ins_cost(150);
3275
  ins_encode %{
3276
    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3277
  %}
3278
  ins_pipe(pipe_slow);
3279
%}
3280

3281
instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
3282
  predicate(UseAVX > 0);
3283
  match(Set dst (MulF src1 (LoadF src2)));
3284

3285
  format %{ "vmulss  $dst, $src1, $src2" %}
3286
  ins_cost(150);
3287
  ins_encode %{
3288
    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3289
  %}
3290
  ins_pipe(pipe_slow);
3291
%}
3292

3293
instruct mulF_reg_imm(regF dst, regF src, immF con) %{
3294
  predicate(UseAVX > 0);
3295
  match(Set dst (MulF src con));
3296

3297
  format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3298
  ins_cost(150);
3299
  ins_encode %{
3300
    __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3301
  %}
3302
  ins_pipe(pipe_slow);
3303
%}
3304

3305
instruct mulD_reg(regD dst, regD src) %{
3306
  predicate((UseSSE>=2) && (UseAVX == 0));
3307
  match(Set dst (MulD dst src));
3308

3309
  format %{ "mulsd   $dst, $src" %}
3310
  ins_cost(150);
3311
  ins_encode %{
3312
    __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
3313
  %}
3314
  ins_pipe(pipe_slow);
3315
%}
3316

3317
instruct mulD_mem(regD dst, memory src) %{
3318
  predicate((UseSSE>=2) && (UseAVX == 0));
3319
  match(Set dst (MulD dst (LoadD src)));
3320

3321
  format %{ "mulsd   $dst, $src" %}
3322
  ins_cost(150);
3323
  ins_encode %{
3324
    __ mulsd($dst$$XMMRegister, $src$$Address);
3325
  %}
3326
  ins_pipe(pipe_slow);
3327
%}
3328

3329
instruct mulD_imm(regD dst, immD con) %{
3330
  predicate((UseSSE>=2) && (UseAVX == 0));
3331
  match(Set dst (MulD dst con));
3332
  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3333
  ins_cost(150);
3334
  ins_encode %{
3335
    __ mulsd($dst$$XMMRegister, $constantaddress($con));
3336
  %}
3337
  ins_pipe(pipe_slow);
3338
%}
3339

3340
instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3341
  predicate(UseAVX > 0);
3342
  match(Set dst (MulD src1 src2));
3343

3344
  format %{ "vmulsd  $dst, $src1, $src2" %}
3345
  ins_cost(150);
3346
  ins_encode %{
3347
    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3348
  %}
3349
  ins_pipe(pipe_slow);
3350
%}
3351

3352
instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3353
  predicate(UseAVX > 0);
3354
  match(Set dst (MulD src1 (LoadD src2)));
3355

3356
  format %{ "vmulsd  $dst, $src1, $src2" %}
3357
  ins_cost(150);
3358
  ins_encode %{
3359
    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3360
  %}
3361
  ins_pipe(pipe_slow);
3362
%}
3363

3364
instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3365
  predicate(UseAVX > 0);
3366
  match(Set dst (MulD src con));
3367

3368
  format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3369
  ins_cost(150);
3370
  ins_encode %{
3371
    __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3372
  %}
3373
  ins_pipe(pipe_slow);
3374
%}
3375

3376
instruct divF_reg(regF dst, regF src) %{
3377
  predicate((UseSSE>=1) && (UseAVX == 0));
3378
  match(Set dst (DivF dst src));
3379

3380
  format %{ "divss   $dst, $src" %}
3381
  ins_cost(150);
3382
  ins_encode %{
3383
    __ divss($dst$$XMMRegister, $src$$XMMRegister);
3384
  %}
3385
  ins_pipe(pipe_slow);
3386
%}
3387

3388
instruct divF_mem(regF dst, memory src) %{
3389
  predicate((UseSSE>=1) && (UseAVX == 0));
3390
  match(Set dst (DivF dst (LoadF src)));
3391

3392
  format %{ "divss   $dst, $src" %}
3393
  ins_cost(150);
3394
  ins_encode %{
3395
    __ divss($dst$$XMMRegister, $src$$Address);
3396
  %}
3397
  ins_pipe(pipe_slow);
3398
%}
3399

3400
instruct divF_imm(regF dst, immF con) %{
3401
  predicate((UseSSE>=1) && (UseAVX == 0));
3402
  match(Set dst (DivF dst con));
3403
  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3404
  ins_cost(150);
3405
  ins_encode %{
3406
    __ divss($dst$$XMMRegister, $constantaddress($con));
3407
  %}
3408
  ins_pipe(pipe_slow);
3409
%}
3410

3411
instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3412
  predicate(UseAVX > 0);
3413
  match(Set dst (DivF src1 src2));
3414

3415
  format %{ "vdivss  $dst, $src1, $src2" %}
3416
  ins_cost(150);
3417
  ins_encode %{
3418
    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3419
  %}
3420
  ins_pipe(pipe_slow);
3421
%}
3422

3423
instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3424
  predicate(UseAVX > 0);
3425
  match(Set dst (DivF src1 (LoadF src2)));
3426

3427
  format %{ "vdivss  $dst, $src1, $src2" %}
3428
  ins_cost(150);
3429
  ins_encode %{
3430
    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3431
  %}
3432
  ins_pipe(pipe_slow);
3433
%}
3434

3435
instruct divF_reg_imm(regF dst, regF src, immF con) %{
3436
  predicate(UseAVX > 0);
3437
  match(Set dst (DivF src con));
3438

3439
  format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3440
  ins_cost(150);
3441
  ins_encode %{
3442
    __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3443
  %}
3444
  ins_pipe(pipe_slow);
3445
%}
3446

3447
instruct divD_reg(regD dst, regD src) %{
3448
  predicate((UseSSE>=2) && (UseAVX == 0));
3449
  match(Set dst (DivD dst src));
3450

3451
  format %{ "divsd   $dst, $src" %}
3452
  ins_cost(150);
3453
  ins_encode %{
3454
    __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3455
  %}
3456
  ins_pipe(pipe_slow);
3457
%}
3458

3459
instruct divD_mem(regD dst, memory src) %{
3460
  predicate((UseSSE>=2) && (UseAVX == 0));
3461
  match(Set dst (DivD dst (LoadD src)));
3462

3463
  format %{ "divsd   $dst, $src" %}
3464
  ins_cost(150);
3465
  ins_encode %{
3466
    __ divsd($dst$$XMMRegister, $src$$Address);
3467
  %}
3468
  ins_pipe(pipe_slow);
3469
%}
3470

3471
instruct divD_imm(regD dst, immD con) %{
3472
  predicate((UseSSE>=2) && (UseAVX == 0));
3473
  match(Set dst (DivD dst con));
3474
  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3475
  ins_cost(150);
3476
  ins_encode %{
3477
    __ divsd($dst$$XMMRegister, $constantaddress($con));
3478
  %}
3479
  ins_pipe(pipe_slow);
3480
%}
3481

3482
instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3483
  predicate(UseAVX > 0);
3484
  match(Set dst (DivD src1 src2));
3485

3486
  format %{ "vdivsd  $dst, $src1, $src2" %}
3487
  ins_cost(150);
3488
  ins_encode %{
3489
    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3490
  %}
3491
  ins_pipe(pipe_slow);
3492
%}
3493

3494
instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3495
  predicate(UseAVX > 0);
3496
  match(Set dst (DivD src1 (LoadD src2)));
3497

3498
  format %{ "vdivsd  $dst, $src1, $src2" %}
3499
  ins_cost(150);
3500
  ins_encode %{
3501
    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3502
  %}
3503
  ins_pipe(pipe_slow);
3504
%}
3505

3506
instruct divD_reg_imm(regD dst, regD src, immD con) %{
3507
  predicate(UseAVX > 0);
3508
  match(Set dst (DivD src con));
3509

3510
  format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3511
  ins_cost(150);
3512
  ins_encode %{
3513
    __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3514
  %}
3515
  ins_pipe(pipe_slow);
3516
%}
3517

3518
instruct absF_reg(regF dst) %{
3519
  predicate((UseSSE>=1) && (UseAVX == 0));
3520
  match(Set dst (AbsF dst));
3521
  ins_cost(150);
3522
  format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3523
  ins_encode %{
3524
    __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3525
  %}
3526
  ins_pipe(pipe_slow);
3527
%}
3528

3529
instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3530
  predicate(UseAVX > 0);
3531
  match(Set dst (AbsF src));
3532
  ins_cost(150);
3533
  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3534
  ins_encode %{
3535
    int vlen_enc = Assembler::AVX_128bit;
3536
    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3537
              ExternalAddress(float_signmask()), vlen_enc);
3538
  %}
3539
  ins_pipe(pipe_slow);
3540
%}
3541

3542
instruct absD_reg(regD dst) %{
3543
  predicate((UseSSE>=2) && (UseAVX == 0));
3544
  match(Set dst (AbsD dst));
3545
  ins_cost(150);
3546
  format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3547
            "# abs double by sign masking" %}
3548
  ins_encode %{
3549
    __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3550
  %}
3551
  ins_pipe(pipe_slow);
3552
%}
3553

3554
instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3555
  predicate(UseAVX > 0);
3556
  match(Set dst (AbsD src));
3557
  ins_cost(150);
3558
  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3559
            "# abs double by sign masking" %}
3560
  ins_encode %{
3561
    int vlen_enc = Assembler::AVX_128bit;
3562
    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3563
              ExternalAddress(double_signmask()), vlen_enc);
3564
  %}
3565
  ins_pipe(pipe_slow);
3566
%}
3567

3568
instruct negF_reg(regF dst) %{
3569
  predicate((UseSSE>=1) && (UseAVX == 0));
3570
  match(Set dst (NegF dst));
3571
  ins_cost(150);
3572
  format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3573
  ins_encode %{
3574
    __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3575
  %}
3576
  ins_pipe(pipe_slow);
3577
%}
3578

3579
instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3580
  predicate(UseAVX > 0);
3581
  match(Set dst (NegF src));
3582
  ins_cost(150);
3583
  format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3584
  ins_encode %{
3585
    __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3586
                 ExternalAddress(float_signflip()));
3587
  %}
3588
  ins_pipe(pipe_slow);
3589
%}
3590

3591
instruct negD_reg(regD dst) %{
3592
  predicate((UseSSE>=2) && (UseAVX == 0));
3593
  match(Set dst (NegD dst));
3594
  ins_cost(150);
3595
  format %{ "xorpd   $dst, [0x8000000000000000]\t"
3596
            "# neg double by sign flipping" %}
3597
  ins_encode %{
3598
    __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3599
  %}
3600
  ins_pipe(pipe_slow);
3601
%}
3602

3603
instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3604
  predicate(UseAVX > 0);
3605
  match(Set dst (NegD src));
3606
  ins_cost(150);
3607
  format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3608
            "# neg double by sign flipping" %}
3609
  ins_encode %{
3610
    __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3611
                 ExternalAddress(double_signflip()));
3612
  %}
3613
  ins_pipe(pipe_slow);
3614
%}
3615

3616
// sqrtss instruction needs destination register to be pre initialized for best performance
3617
// Therefore only the instruct rule where the input is pre-loaded into dst register is defined below
3618
instruct sqrtF_reg(regF dst) %{
3619
  predicate(UseSSE>=1);
3620
  match(Set dst (SqrtF dst));
3621
  format %{ "sqrtss  $dst, $dst" %}
3622
  ins_encode %{
3623
    __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3624
  %}
3625
  ins_pipe(pipe_slow);
3626
%}
3627

3628
// sqrtsd instruction needs destination register to be pre initialized for best performance
3629
// Therefore only the instruct rule where the input is pre-loaded into dst register is defined below
3630
instruct sqrtD_reg(regD dst) %{
3631
  predicate(UseSSE>=2);
3632
  match(Set dst (SqrtD dst));
3633
  format %{ "sqrtsd  $dst, $dst" %}
3634
  ins_encode %{
3635
    __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3636
  %}
3637
  ins_pipe(pipe_slow);
3638
%}
3639

3640
instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
3641
  effect(TEMP tmp);
3642
  match(Set dst (ConvF2HF src));
3643
  ins_cost(125);
3644
  format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
3645
  ins_encode %{
3646
    __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
3647
  %}
3648
  ins_pipe( pipe_slow );
3649
%}
3650

3651
instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
3652
  predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
3653
  effect(TEMP ktmp, TEMP rtmp);
3654
  match(Set mem (StoreC mem (ConvF2HF src)));
3655
  format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
3656
  ins_encode %{
3657
    __ movl($rtmp$$Register, 0x1);
3658
    __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
3659
    __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
3660
  %}
3661
  ins_pipe( pipe_slow );
3662
%}
3663

3664
instruct vconvF2HF(vec dst, vec src) %{
3665
  match(Set dst (VectorCastF2HF src));
3666
  format %{ "vector_conv_F2HF $dst $src" %}
3667
  ins_encode %{
3668
    int vlen_enc = vector_length_encoding(this, $src);
3669
    __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
3670
  %}
3671
  ins_pipe( pipe_slow );
3672
%}
3673

3674
instruct vconvF2HF_mem_reg(memory mem, vec src) %{
3675
  match(Set mem (StoreVector mem (VectorCastF2HF src)));
3676
  format %{ "vcvtps2ph $mem,$src" %}
3677
  ins_encode %{
3678
    int vlen_enc = vector_length_encoding(this, $src);
3679
    __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
3680
  %}
3681
  ins_pipe( pipe_slow );
3682
%}
3683

3684
instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
3685
  match(Set dst (ConvHF2F src));
3686
  format %{ "vcvtph2ps $dst,$src" %}
3687
  ins_encode %{
3688
    __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
3689
  %}
3690
  ins_pipe( pipe_slow );
3691
%}
3692

3693
instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
3694
  match(Set dst (VectorCastHF2F (LoadVector mem)));
3695
  format %{ "vcvtph2ps $dst,$mem" %}
3696
  ins_encode %{
3697
    int vlen_enc = vector_length_encoding(this);
3698
    __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
3699
  %}
3700
  ins_pipe( pipe_slow );
3701
%}
3702

3703
instruct vconvHF2F(vec dst, vec src) %{
3704
  match(Set dst (VectorCastHF2F src));
3705
  ins_cost(125);
3706
  format %{ "vector_conv_HF2F $dst,$src" %}
3707
  ins_encode %{
3708
    int vlen_enc = vector_length_encoding(this);
3709
    __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
3710
  %}
3711
  ins_pipe( pipe_slow );
3712
%}
3713

3714
// ---------------------------------------- VectorReinterpret ------------------------------------
3715
instruct reinterpret_mask(kReg dst) %{
3716
  predicate(n->bottom_type()->isa_vectmask() &&
3717
            Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
3718
  match(Set dst (VectorReinterpret dst));
3719
  ins_cost(125);
3720
  format %{ "vector_reinterpret $dst\t!" %}
3721
  ins_encode %{
3722
    // empty
3723
  %}
3724
  ins_pipe( pipe_slow );
3725
%}
3726

3727
instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
3728
  predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3729
            n->bottom_type()->isa_vectmask() &&
3730
            n->in(1)->bottom_type()->isa_vectmask() &&
3731
            n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
3732
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3733
  match(Set dst (VectorReinterpret src));
3734
  effect(TEMP xtmp);
3735
  format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
3736
  ins_encode %{
3737
     int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
3738
     int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3739
     assert(src_sz == dst_sz , "src and dst size mismatch");
3740
     int vlen_enc = vector_length_encoding(src_sz);
3741
     __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3742
     __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3743
  %}
3744
  ins_pipe( pipe_slow );
3745
%}
3746

3747
instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
3748
  predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3749
            n->bottom_type()->isa_vectmask() &&
3750
            n->in(1)->bottom_type()->isa_vectmask() &&
3751
            (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
3752
             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
3753
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3754
  match(Set dst (VectorReinterpret src));
3755
  effect(TEMP xtmp);
3756
  format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
3757
  ins_encode %{
3758
     int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
3759
     int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3760
     assert(src_sz == dst_sz , "src and dst size mismatch");
3761
     int vlen_enc = vector_length_encoding(src_sz);
3762
     __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3763
     __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3764
  %}
3765
  ins_pipe( pipe_slow );
3766
%}
3767

3768
instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
3769
  predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3770
            n->bottom_type()->isa_vectmask() &&
3771
            n->in(1)->bottom_type()->isa_vectmask() &&
3772
            (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
3773
             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
3774
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3775
  match(Set dst (VectorReinterpret src));
3776
  effect(TEMP xtmp);
3777
  format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
3778
  ins_encode %{
3779
     int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
3780
     int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3781
     assert(src_sz == dst_sz , "src and dst size mismatch");
3782
     int vlen_enc = vector_length_encoding(src_sz);
3783
     __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3784
     __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3785
  %}
3786
  ins_pipe( pipe_slow );
3787
%}
3788

3789
instruct reinterpret(vec dst) %{
3790
  predicate(!n->bottom_type()->isa_vectmask() &&
3791
            Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
3792
  match(Set dst (VectorReinterpret dst));
3793
  ins_cost(125);
3794
  format %{ "vector_reinterpret $dst\t!" %}
3795
  ins_encode %{
3796
    // empty
3797
  %}
3798
  ins_pipe( pipe_slow );
3799
%}
3800

3801
instruct reinterpret_expand(vec dst, vec src) %{
3802
  predicate(UseAVX == 0 &&
3803
            (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3804
  match(Set dst (VectorReinterpret src));
3805
  ins_cost(125);
3806
  effect(TEMP dst);
3807
  format %{ "vector_reinterpret_expand $dst,$src" %}
3808
  ins_encode %{
3809
    assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
3810
    assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
3811

3812
    int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3813
    if (src_vlen_in_bytes == 4) {
3814
      __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
3815
    } else {
3816
      assert(src_vlen_in_bytes == 8, "");
3817
      __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
3818
    }
3819
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
3820
  %}
3821
  ins_pipe( pipe_slow );
3822
%}
3823

3824
instruct vreinterpret_expand4(legVec dst, vec src) %{
3825
  predicate(UseAVX > 0 &&
3826
            !n->bottom_type()->isa_vectmask() &&
3827
            (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
3828
            (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3829
  match(Set dst (VectorReinterpret src));
3830
  ins_cost(125);
3831
  format %{ "vector_reinterpret_expand $dst,$src" %}
3832
  ins_encode %{
3833
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
3834
  %}
3835
  ins_pipe( pipe_slow );
3836
%}
3837

3838

3839
instruct vreinterpret_expand(legVec dst, vec src) %{
3840
  predicate(UseAVX > 0 &&
3841
            !n->bottom_type()->isa_vectmask() &&
3842
            (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
3843
            (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3844
  match(Set dst (VectorReinterpret src));
3845
  ins_cost(125);
3846
  format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3847
  ins_encode %{
3848
    switch (Matcher::vector_length_in_bytes(this, $src)) {
3849
      case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3850
      case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3851
      case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3852
      default: ShouldNotReachHere();
3853
    }
3854
  %}
3855
  ins_pipe( pipe_slow );
3856
%}
3857

3858
instruct reinterpret_shrink(vec dst, legVec src) %{
3859
  predicate(!n->bottom_type()->isa_vectmask() &&
3860
            Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
3861
  match(Set dst (VectorReinterpret src));
3862
  ins_cost(125);
3863
  format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3864
  ins_encode %{
3865
    switch (Matcher::vector_length_in_bytes(this)) {
3866
      case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3867
      case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3868
      case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3869
      case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3870
      default: ShouldNotReachHere();
3871
    }
3872
  %}
3873
  ins_pipe( pipe_slow );
3874
%}
3875

3876
// ----------------------------------------------------------------------------------------------------
3877

3878
#ifdef _LP64
3879
instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3880
  match(Set dst (RoundDoubleMode src rmode));
3881
  format %{ "roundsd $dst,$src" %}
3882
  ins_cost(150);
3883
  ins_encode %{
3884
    assert(UseSSE >= 4, "required");
3885
    if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
3886
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3887
    }
3888
    __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3889
  %}
3890
  ins_pipe(pipe_slow);
3891
%}
3892

3893
instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
3894
  match(Set dst (RoundDoubleMode con rmode));
3895
  format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3896
  ins_cost(150);
3897
  ins_encode %{
3898
    assert(UseSSE >= 4, "required");
3899
    __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
3900
  %}
3901
  ins_pipe(pipe_slow);
3902
%}
3903

3904
instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3905
  predicate(Matcher::vector_length(n) < 8);
3906
  match(Set dst (RoundDoubleModeV src rmode));
3907
  format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3908
  ins_encode %{
3909
    assert(UseAVX > 0, "required");
3910
    int vlen_enc = vector_length_encoding(this);
3911
    __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3912
  %}
3913
  ins_pipe( pipe_slow );
3914
%}
3915

3916
instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3917
  predicate(Matcher::vector_length(n) == 8);
3918
  match(Set dst (RoundDoubleModeV src rmode));
3919
  format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3920
  ins_encode %{
3921
    assert(UseAVX > 2, "required");
3922
    __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3923
  %}
3924
  ins_pipe( pipe_slow );
3925
%}
3926

3927
instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3928
  predicate(Matcher::vector_length(n) < 8);
3929
  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3930
  format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3931
  ins_encode %{
3932
    assert(UseAVX > 0, "required");
3933
    int vlen_enc = vector_length_encoding(this);
3934
    __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3935
  %}
3936
  ins_pipe( pipe_slow );
3937
%}
3938

3939
instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3940
  predicate(Matcher::vector_length(n) == 8);
3941
  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3942
  format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3943
  ins_encode %{
3944
    assert(UseAVX > 2, "required");
3945
    __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3946
  %}
3947
  ins_pipe( pipe_slow );
3948
%}
3949
#endif // _LP64
3950

3951
instruct onspinwait() %{
3952
  match(OnSpinWait);
3953
  ins_cost(200);
3954

3955
  format %{
3956
    $$template
3957
    $$emit$$"pause\t! membar_onspinwait"
3958
  %}
3959
  ins_encode %{
3960
    __ pause();
3961
  %}
3962
  ins_pipe(pipe_slow);
3963
%}
3964

3965
// a * b + c
3966
instruct fmaD_reg(regD a, regD b, regD c) %{
3967
  match(Set c (FmaD  c (Binary a b)));
3968
  format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3969
  ins_cost(150);
3970
  ins_encode %{
3971
    assert(UseFMA, "Needs FMA instructions support.");
3972
    __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3973
  %}
3974
  ins_pipe( pipe_slow );
3975
%}
3976

3977
// a * b + c
3978
instruct fmaF_reg(regF a, regF b, regF c) %{
3979
  match(Set c (FmaF  c (Binary a b)));
3980
  format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3981
  ins_cost(150);
3982
  ins_encode %{
3983
    assert(UseFMA, "Needs FMA instructions support.");
3984
    __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3985
  %}
3986
  ins_pipe( pipe_slow );
3987
%}
3988

3989
// ====================VECTOR INSTRUCTIONS=====================================
3990

3991
// Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3992
instruct MoveVec2Leg(legVec dst, vec src) %{
3993
  match(Set dst src);
3994
  format %{ "" %}
3995
  ins_encode %{
3996
    ShouldNotReachHere();
3997
  %}
3998
  ins_pipe( fpu_reg_reg );
3999
%}
4000

4001
instruct MoveLeg2Vec(vec dst, legVec src) %{
4002
  match(Set dst src);
4003
  format %{ "" %}
4004
  ins_encode %{
4005
    ShouldNotReachHere();
4006
  %}
4007
  ins_pipe( fpu_reg_reg );
4008
%}
4009

4010
// ============================================================================
4011

4012
// Load vectors generic operand pattern
4013
instruct loadV(vec dst, memory mem) %{
4014
  match(Set dst (LoadVector mem));
4015
  ins_cost(125);
4016
  format %{ "load_vector $dst,$mem" %}
4017
  ins_encode %{
4018
    __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
4019
  %}
4020
  ins_pipe( pipe_slow );
4021
%}
4022

4023
// Store vectors generic operand pattern.
4024
instruct storeV(memory mem, vec src) %{
4025
  match(Set mem (StoreVector mem src));
4026
  ins_cost(145);
4027
  format %{ "store_vector $mem,$src\n\t" %}
4028
  ins_encode %{
4029
    switch (Matcher::vector_length_in_bytes(this, $src)) {
4030
      case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
4031
      case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
4032
      case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
4033
      case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
4034
      case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
4035
      default: ShouldNotReachHere();
4036
    }
4037
  %}
4038
  ins_pipe( pipe_slow );
4039
%}
4040

4041
// ---------------------------------------- Gather ------------------------------------
4042

4043
// Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
4044
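// Lowering strategy for the gather patterns below:
//  - INT/LONG/FLOAT/DOUBLE gathers use the hardware gather forms: AVX2 predication is
//    carried in a vector mask, AVX-512 predication in an opmask (k) register.
//  - BYTE/SHORT gathers have no hardware gather instruction and are emulated with scalar
//    loads via the vgather8b*/vgather_subword macro-assembler helpers, which is why these
//    patterns need additional TEMP registers.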

4045
instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
4046
  predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
4047
            Matcher::vector_length_in_bytes(n) <= 32);
4048
  match(Set dst (LoadVectorGather mem idx));
4049
  effect(TEMP dst, TEMP tmp, TEMP mask);
4050
  format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
4051
  ins_encode %{
4052
    int vlen_enc = vector_length_encoding(this);
4053
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4054
    assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
4055
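    // vpcmpeqd of a register with itself produces all-ones, i.e. an all-true vector mask,
    // so every lane is gathered.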
    __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
4056
    __ lea($tmp$$Register, $mem$$Address);
4057
    __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
4058
  %}
4059
  ins_pipe( pipe_slow );
4060
%}
4061

4062

4063
instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
4064
  predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
4065
            !is_subword_type(Matcher::vector_element_basic_type(n)));
4066
  match(Set dst (LoadVectorGather mem idx));
4067
  effect(TEMP dst, TEMP tmp, TEMP ktmp);
4068
  format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
4069
  ins_encode %{
4070
    int vlen_enc = vector_length_encoding(this);
4071
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4072
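    // kxnor of an opmask register with itself sets every mask bit: gather all lanes.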
    __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
4073
    __ lea($tmp$$Register, $mem$$Address);
4074
    __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
4075
  %}
4076
  ins_pipe( pipe_slow );
4077
%}
4078

4079
instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
4080
  predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
4081
            !is_subword_type(Matcher::vector_element_basic_type(n)));
4082
  match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
4083
  effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
4084
  format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
4085
  ins_encode %{
4086
    assert(UseAVX > 2, "sanity");
4087
    int vlen_enc = vector_length_encoding(this);
4088
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4089
    assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
4090
    // Note: the gather instruction partially updates the opmask register used
    // for predication, hence the mask operand is first copied to a temporary.
4092
    __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
4093
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4094
    __ lea($tmp$$Register, $mem$$Address);
4095
    __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
4096
  %}
4097
  ins_pipe( pipe_slow );
4098
%}
4099

4100
instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
4101
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
4102
  match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
4103
  effect(TEMP tmp, TEMP rtmp);
4104
  format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
4105
  ins_encode %{
4106
    int vlen_enc = vector_length_encoding(this);
4107
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4108
    __ lea($tmp$$Register, $mem$$Address);
4109
    __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
4110
  %}
4111
  ins_pipe( pipe_slow );
4112
%}
4113

4114
instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
4115
                             vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
4116
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
4117
  match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
4118
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
4119
  format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
4120
  ins_encode %{
4121
    int vlen_enc = vector_length_encoding(this);
4122
    int vector_len = Matcher::vector_length(this);
4123
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4124
    __ lea($tmp$$Register, $mem$$Address);
4125
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
4126
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
4127
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
4128
  %}
4129
  ins_pipe( pipe_slow );
4130
%}
4131

4132
instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
4133
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
4134
  match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
4135
  effect(TEMP tmp, TEMP rtmp, KILL cr);
4136
  format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
4137
  ins_encode %{
4138
    int vlen_enc = vector_length_encoding(this);
4139
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4140
    __ lea($tmp$$Register, $mem$$Address);
4141
    __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
4142
  %}
4143
  ins_pipe( pipe_slow );
4144
%}
4145

4146

4147
instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
4148
                                 vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
4149
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
4150
  match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
4151
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
4152
  format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
4153
  ins_encode %{
4154
    int vlen_enc = vector_length_encoding(this);
4155
    int vector_len = Matcher::vector_length(this);
4156
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4157
    __ lea($tmp$$Register, $mem$$Address);
4158
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
4159
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
4160
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
4161
  %}
4162
  ins_pipe( pipe_slow );
4163
%}
4164

4165

4166
#ifdef _LP64
4167
instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
4168
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
4169
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
4170
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
4171
  format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
4172
  ins_encode %{
4173
    int vlen_enc = vector_length_encoding(this);
4174
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4175
    __ xorq($mask_idx$$Register, $mask_idx$$Register);
4176
    __ lea($tmp$$Register, $mem$$Address);
4177
    __ kmovql($rtmp2$$Register, $mask$$KRegister);
4178
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
4179
  %}
4180
  ins_pipe( pipe_slow );
4181
%}
4182

4183
instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
4184
                                         vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
4185
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
4186
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
4187
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
4188
  format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
4189
  ins_encode %{
4190
    int vlen_enc = vector_length_encoding(this);
4191
    int vector_len = Matcher::vector_length(this);
4192
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4193
    __ xorq($mask_idx$$Register, $mask_idx$$Register);
4194
    __ lea($tmp$$Register, $mem$$Address);
4195
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
4196
    __ kmovql($rtmp2$$Register, $mask$$KRegister);
4197
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
4198
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
4199
  %}
4200
  ins_pipe( pipe_slow );
4201
%}
4202

4203
instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
4204
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
4205
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
4206
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
4207
  format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
4208
  ins_encode %{
4209
    int vlen_enc = vector_length_encoding(this);
4210
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4211
    __ xorq($mask_idx$$Register, $mask_idx$$Register);
4212
    __ lea($tmp$$Register, $mem$$Address);
4213
    __ kmovql($rtmp2$$Register, $mask$$KRegister);
4214
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
4215
                                $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
4216
  %}
4217
  ins_pipe( pipe_slow );
4218
%}
4219

4220
instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
4221
                                             vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
4222
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
4223
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
4224
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
4225
  format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
4226
  ins_encode %{
4227
    int vlen_enc = vector_length_encoding(this);
4228
    int vector_len = Matcher::vector_length(this);
4229
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4230
    __ xorq($mask_idx$$Register, $mask_idx$$Register);
4231
    __ lea($tmp$$Register, $mem$$Address);
4232
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
4233
    __ kmovql($rtmp2$$Register, $mask$$KRegister);
4234
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
4235
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
4236
  %}
4237
  ins_pipe( pipe_slow );
4238
%}
4239

4240
instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
4241
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
4242
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
4243
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
4244
  format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
4245
  ins_encode %{
4246
    int vlen_enc = vector_length_encoding(this);
4247
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4248
    __ lea($tmp$$Register, $mem$$Address);
4249
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
4250
    if (elem_bt == T_SHORT) {
4251
      __ movl($mask_idx$$Register, 0x55555555);
4252
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
4253
    }
4254
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
4255
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
4256
  %}
4257
  ins_pipe( pipe_slow );
4258
%}
4259

4260
instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
4261
                                         vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
4262
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
4263
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
4264
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
4265
  format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
4266
  ins_encode %{
4267
    int vlen_enc = vector_length_encoding(this);
4268
    int vector_len = Matcher::vector_length(this);
4269
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4270
    __ lea($tmp$$Register, $mem$$Address);
4271
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
4272
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
4273
    if (elem_bt == T_SHORT) {
4274
      __ movl($mask_idx$$Register, 0x55555555);
4275
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
4276
    }
4277
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
4278
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
4279
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
4280
  %}
4281
  ins_pipe( pipe_slow );
4282
%}
4283

4284
instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
4285
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
4286
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
4287
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
4288
  format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
4289
  ins_encode %{
4290
    int vlen_enc = vector_length_encoding(this);
4291
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4292
    __ lea($tmp$$Register, $mem$$Address);
4293
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
4294
    if (elem_bt == T_SHORT) {
4295
      __ movl($mask_idx$$Register, 0x55555555);
4296
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
4297
    }
4298
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
4299
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
4300
                                $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
4301
  %}
4302
  ins_pipe( pipe_slow );
4303
%}
4304

4305
instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
4306
                                             vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
4307
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
4308
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
4309
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
4310
  format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
4311
  ins_encode %{
4312
    int vlen_enc = vector_length_encoding(this);
4313
    int vector_len = Matcher::vector_length(this);
4314
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4315
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
4316
    __ lea($tmp$$Register, $mem$$Address);
4317
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
4318
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
4319
    if (elem_bt == T_SHORT) {
4320
      __ movl($mask_idx$$Register, 0x55555555);
4321
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
4322
    }
4323
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
4324
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
4325
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
4326
  %}
4327
  ins_pipe( pipe_slow );
4328
%}
4329
#endif
4330

4331
// ====================Scatter=======================================
4332

4333
// Scatter INT, LONG, FLOAT, DOUBLE
4334

4335
instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
4336
  predicate(UseAVX > 2);
4337
  match(Set mem (StoreVectorScatter mem (Binary src idx)));
4338
  effect(TEMP tmp, TEMP ktmp);
4339
  format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
4340
  ins_encode %{
4341
    int vlen_enc = vector_length_encoding(this, $src);
4342
    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
4343

4344
    assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
4345
    assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
4346

4347
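    // Load an all-ones opmask so that the unmasked scatter stores every lane.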
    __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
4348
    __ lea($tmp$$Register, $mem$$Address);
4349
    __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
4350
  %}
4351
  ins_pipe( pipe_slow );
4352
%}
4353

4354
instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
4355
  match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
4356
  effect(TEMP tmp, TEMP ktmp);
4357
  format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
4358
  ins_encode %{
4359
    int vlen_enc = vector_length_encoding(this, $src);
4360
    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
4361
    assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
4362
    assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
4363
    // Note: the scatter instruction partially updates the opmask register used
    // for predication, hence the mask operand is first copied to a temporary.
4365
    __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
4366
    __ lea($tmp$$Register, $mem$$Address);
4367
    __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
4368
  %}
4369
  ins_pipe( pipe_slow );
4370
%}
4371

4372
// ====================REPLICATE=======================================
4373
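// The replicate patterns below prefer the AVX2/AVX-512 broadcast instructions when they are
// available and fall back to SSE shuffle/unpack sequences otherwise. Immediate operands are
// materialized by loading a pre-replicated constant from the constant table; zero and
// all-ones values have dedicated pxor/vallones forms.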

4374
// Replicate byte scalar to be vector
4375
instruct vReplB_reg(vec dst, rRegI src) %{
4376
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
4377
  match(Set dst (Replicate src));
4378
  format %{ "replicateB $dst,$src" %}
4379
  ins_encode %{
4380
    uint vlen = Matcher::vector_length(this);
4381
    if (UseAVX >= 2) {
4382
      int vlen_enc = vector_length_encoding(this);
4383
      if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
4384
        assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
4385
        __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
4386
      } else {
4387
        __ movdl($dst$$XMMRegister, $src$$Register);
4388
        __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4389
      }
4390
    } else {
4391
      assert(UseAVX < 2, "");
4392
      __ movdl($dst$$XMMRegister, $src$$Register);
4393
      __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4394
      __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4395
      if (vlen >= 16) {
4396
        assert(vlen == 16, "");
4397
        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4398
      }
4399
    }
4400
  %}
4401
  ins_pipe( pipe_slow );
4402
%}
4403

4404
instruct ReplB_mem(vec dst, memory mem) %{
4405
  predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
4406
  match(Set dst (Replicate (LoadB mem)));
4407
  format %{ "replicateB $dst,$mem" %}
4408
  ins_encode %{
4409
    int vlen_enc = vector_length_encoding(this);
4410
    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
4411
  %}
4412
  ins_pipe( pipe_slow );
4413
%}
4414

4415
// ====================ReplicateS=======================================
4416

4417
instruct vReplS_reg(vec dst, rRegI src) %{
4418
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
4419
  match(Set dst (Replicate src));
4420
  format %{ "replicateS $dst,$src" %}
4421
  ins_encode %{
4422
    uint vlen = Matcher::vector_length(this);
4423
    int vlen_enc = vector_length_encoding(this);
4424
    if (UseAVX >= 2) {
4425
      if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
4426
        assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
4427
        __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
4428
      } else {
4429
        __ movdl($dst$$XMMRegister, $src$$Register);
4430
        __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4431
      }
4432
    } else {
4433
      assert(UseAVX < 2, "");
4434
      __ movdl($dst$$XMMRegister, $src$$Register);
4435
      __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4436
      if (vlen >= 8) {
4437
        assert(vlen == 8, "");
4438
        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4439
      }
4440
    }
4441
  %}
4442
  ins_pipe( pipe_slow );
4443
%}
4444

4445
instruct ReplS_mem(vec dst, memory mem) %{
4446
  predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
4447
  match(Set dst (Replicate (LoadS mem)));
4448
  format %{ "replicateS $dst,$mem" %}
4449
  ins_encode %{
4450
    int vlen_enc = vector_length_encoding(this);
4451
    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
4452
  %}
4453
  ins_pipe( pipe_slow );
4454
%}
4455

4456
// ====================ReplicateI=======================================
4457

4458
instruct ReplI_reg(vec dst, rRegI src) %{
4459
  predicate(Matcher::vector_element_basic_type(n) == T_INT);
4460
  match(Set dst (Replicate src));
4461
  format %{ "replicateI $dst,$src" %}
4462
  ins_encode %{
4463
    uint vlen = Matcher::vector_length(this);
4464
    int vlen_enc = vector_length_encoding(this);
4465
    if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4466
      __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
4467
    } else if (VM_Version::supports_avx2()) {
4468
      __ movdl($dst$$XMMRegister, $src$$Register);
4469
      __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4470
    } else {
4471
      __ movdl($dst$$XMMRegister, $src$$Register);
4472
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4473
    }
4474
  %}
4475
  ins_pipe( pipe_slow );
4476
%}
4477

4478
instruct ReplI_mem(vec dst, memory mem) %{
4479
  predicate(Matcher::vector_element_basic_type(n) == T_INT);
4480
  match(Set dst (Replicate (LoadI mem)));
4481
  format %{ "replicateI $dst,$mem" %}
4482
  ins_encode %{
4483
    int vlen_enc = vector_length_encoding(this);
4484
    if (VM_Version::supports_avx2()) {
4485
      __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4486
    } else if (VM_Version::supports_avx()) {
4487
      __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4488
    } else {
4489
      __ movdl($dst$$XMMRegister, $mem$$Address);
4490
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4491
    }
4492
  %}
4493
  ins_pipe( pipe_slow );
4494
%}
4495

4496
instruct ReplI_imm(vec dst, immI con) %{
4497
  predicate(Matcher::is_non_long_integral_vector(n));
4498
  match(Set dst (Replicate con));
4499
  format %{ "replicateI $dst,$con" %}
4500
  ins_encode %{
4501
    InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
4502
        vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
4503
            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
4504
                type2aelembytes(Matcher::vector_element_basic_type(this))));
4505
    BasicType bt = Matcher::vector_element_basic_type(this);
4506
    int vlen = Matcher::vector_length_in_bytes(this);
4507
    __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
4508
  %}
4509
  ins_pipe( pipe_slow );
4510
%}
4511

4512
// Replicate scalar zero to be vector
4513
instruct ReplI_zero(vec dst, immI_0 zero) %{
4514
  predicate(Matcher::is_non_long_integral_vector(n));
4515
  match(Set dst (Replicate zero));
4516
  format %{ "replicateI $dst,$zero" %}
4517
  ins_encode %{
4518
    int vlen_enc = vector_length_encoding(this);
4519
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
4520
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4521
    } else {
4522
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4523
    }
4524
  %}
4525
  ins_pipe( fpu_reg_reg );
4526
%}
4527

4528
instruct ReplI_M1(vec dst, immI_M1 con) %{
4529
  predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
4530
  match(Set dst (Replicate con));
4531
  format %{ "vallones $dst" %}
4532
  ins_encode %{
4533
    int vector_len = vector_length_encoding(this);
4534
    __ vallones($dst$$XMMRegister, vector_len);
4535
  %}
4536
  ins_pipe( pipe_slow );
4537
%}
4538

4539
// ====================ReplicateL=======================================
4540

4541
#ifdef _LP64
4542
// Replicate long (8 byte) scalar to be vector
4543
instruct ReplL_reg(vec dst, rRegL src) %{
4544
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
4545
  match(Set dst (Replicate src));
4546
  format %{ "replicateL $dst,$src" %}
4547
  ins_encode %{
4548
    int vlen = Matcher::vector_length(this);
4549
    int vlen_enc = vector_length_encoding(this);
4550
    if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4551
      __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
4552
    } else if (VM_Version::supports_avx2()) {
4553
      __ movdq($dst$$XMMRegister, $src$$Register);
4554
      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4555
    } else {
4556
      __ movdq($dst$$XMMRegister, $src$$Register);
4557
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4558
    }
4559
  %}
4560
  ins_pipe( pipe_slow );
4561
%}
4562
#else // _LP64
4563
// Replicate long (8 byte) scalar to be vector
4564
instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
4565
  predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
4566
  match(Set dst (Replicate src));
4567
  effect(TEMP dst, USE src, TEMP tmp);
4568
  format %{ "replicateL $dst,$src" %}
4569
  ins_encode %{
4570
    uint vlen = Matcher::vector_length(this);
4571
    if (vlen == 2) {
4572
      __ movdl($dst$$XMMRegister, $src$$Register);
4573
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4574
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4575
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4576
    } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4577
      int vlen_enc = Assembler::AVX_256bit;
4578
      __ movdl($dst$$XMMRegister, $src$$Register);
4579
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4580
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4581
      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4582
    } else {
4583
      __ movdl($dst$$XMMRegister, $src$$Register);
4584
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4585
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4586
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4587
      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4588
    }
4589
  %}
4590
  ins_pipe( pipe_slow );
4591
%}
4592

4593
instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
4594
  predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
4595
  match(Set dst (Replicate src));
4596
  effect(TEMP dst, USE src, TEMP tmp);
4597
  format %{ "replicateL $dst,$src" %}
4598
  ins_encode %{
4599
    if (VM_Version::supports_avx512vl()) {
4600
      __ movdl($dst$$XMMRegister, $src$$Register);
4601
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4602
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4603
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4604
      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4605
      __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4606
    } else {
4607
      int vlen_enc = Assembler::AVX_512bit;
4608
      __ movdl($dst$$XMMRegister, $src$$Register);
4609
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4610
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4611
      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4612
    }
4613
  %}
4614
  ins_pipe( pipe_slow );
4615
%}
4616
#endif // _LP64
4617

4618
instruct ReplL_mem(vec dst, memory mem) %{
4619
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
4620
  match(Set dst (Replicate (LoadL mem)));
4621
  format %{ "replicateL $dst,$mem" %}
4622
  ins_encode %{
4623
    int vlen_enc = vector_length_encoding(this);
4624
    if (VM_Version::supports_avx2()) {
4625
      __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4626
    } else if (VM_Version::supports_sse3()) {
4627
      __ movddup($dst$$XMMRegister, $mem$$Address);
4628
    } else {
4629
      __ movq($dst$$XMMRegister, $mem$$Address);
4630
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4631
    }
4632
  %}
4633
  ins_pipe( pipe_slow );
4634
%}
4635

4636
// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4637
instruct ReplL_imm(vec dst, immL con) %{
4638
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
4639
  match(Set dst (Replicate con));
4640
  format %{ "replicateL $dst,$con" %}
4641
  ins_encode %{
4642
    InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
4643
    int vlen = Matcher::vector_length_in_bytes(this);
4644
    __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
4645
  %}
4646
  ins_pipe( pipe_slow );
4647
%}
4648

4649
instruct ReplL_zero(vec dst, immL0 zero) %{
4650
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
4651
  match(Set dst (Replicate zero));
4652
  format %{ "replicateL $dst,$zero" %}
4653
  ins_encode %{
4654
    int vlen_enc = vector_length_encoding(this);
4655
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
4656
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4657
    } else {
4658
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4659
    }
4660
  %}
4661
  ins_pipe( fpu_reg_reg );
4662
%}
4663

4664
instruct ReplL_M1(vec dst, immL_M1 con) %{
4665
  predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
4666
  match(Set dst (Replicate con));
4667
  format %{ "vallones $dst" %}
4668
  ins_encode %{
4669
    int vector_len = vector_length_encoding(this);
4670
    __ vallones($dst$$XMMRegister, vector_len);
4671
  %}
4672
  ins_pipe( pipe_slow );
4673
%}
4674

4675
// ====================ReplicateF=======================================
4676

4677
instruct vReplF_reg(vec dst, vlRegF src) %{
4678
  predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
4679
  match(Set dst (Replicate src));
4680
  format %{ "replicateF $dst,$src" %}
4681
  ins_encode %{
4682
    uint vlen = Matcher::vector_length(this);
4683
    int vlen_enc = vector_length_encoding(this);
4684
    if (vlen <= 4) {
4685
      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
4686
    } else if (VM_Version::supports_avx2()) {
4687
      __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4688
    } else {
4689
      assert(vlen == 8, "sanity");
4690
      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
4691
      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4692
    }
4693
  %}
4694
  ins_pipe( pipe_slow );
4695
%}
4696

4697
instruct ReplF_reg(vec dst, vlRegF src) %{
4698
  predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
4699
  match(Set dst (Replicate src));
4700
  format %{ "replicateF $dst,$src" %}
4701
  ins_encode %{
4702
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4703
  %}
4704
  ins_pipe( pipe_slow );
4705
%}
4706

4707
instruct ReplF_mem(vec dst, memory mem) %{
4708
  predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
4709
  match(Set dst (Replicate (LoadF mem)));
4710
  format %{ "replicateF $dst,$mem" %}
4711
  ins_encode %{
4712
    int vlen_enc = vector_length_encoding(this);
4713
    __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4714
  %}
4715
  ins_pipe( pipe_slow );
4716
%}
4717

4718
// Replicate float scalar immediate to be vector by loading from const table.
4719
instruct ReplF_imm(vec dst, immF con) %{
4720
  predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
4721
  match(Set dst (Replicate con));
4722
  format %{ "replicateF $dst,$con" %}
4723
  ins_encode %{
4724
    InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
4725
        VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
4726
    int vlen = Matcher::vector_length_in_bytes(this);
4727
    __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
4728
  %}
4729
  ins_pipe( pipe_slow );
4730
%}
4731

4732
instruct ReplF_zero(vec dst, immF0 zero) %{
4733
  predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
4734
  match(Set dst (Replicate zero));
4735
  format %{ "replicateF $dst,$zero" %}
4736
  ins_encode %{
4737
    int vlen_enc = vector_length_encoding(this);
4738
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
4739
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4740
    } else {
4741
      __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4742
    }
4743
  %}
4744
  ins_pipe( fpu_reg_reg );
4745
%}
4746

4747
// ====================ReplicateD=======================================
4748

4749
// Replicate double (8 bytes) scalar to be vector
4750
instruct vReplD_reg(vec dst, vlRegD src) %{
4751
  predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
4752
  match(Set dst (Replicate src));
4753
  format %{ "replicateD $dst,$src" %}
4754
  ins_encode %{
4755
    uint vlen = Matcher::vector_length(this);
4756
    int vlen_enc = vector_length_encoding(this);
4757
    if (vlen <= 2) {
4758
      __ movddup($dst$$XMMRegister, $src$$XMMRegister);
4759
    } else if (VM_Version::supports_avx2()) {
4760
      __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4761
    } else {
4762
      assert(vlen == 4, "sanity");
4763
      __ movddup($dst$$XMMRegister, $src$$XMMRegister);
4764
      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4765
    }
4766
  %}
4767
  ins_pipe( pipe_slow );
4768
%}
4769

4770
instruct ReplD_reg(vec dst, vlRegD src) %{
4771
  predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
4772
  match(Set dst (Replicate src));
4773
  format %{ "replicateD $dst,$src" %}
4774
  ins_encode %{
4775
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4776
  %}
4777
  ins_pipe( pipe_slow );
4778
%}
4779

4780
instruct ReplD_mem(vec dst, memory mem) %{
4781
  predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
4782
  match(Set dst (Replicate (LoadD mem)));
4783
  format %{ "replicateD $dst,$mem" %}
4784
  ins_encode %{
4785
    if (Matcher::vector_length(this) >= 4) {
4786
      int vlen_enc = vector_length_encoding(this);
4787
      __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4788
    } else {
4789
      __ movddup($dst$$XMMRegister, $mem$$Address);
4790
    }
4791
  %}
4792
  ins_pipe( pipe_slow );
4793
%}
4794

4795
// Replicate double (8 byte) scalar immediate to be vector by loading from const table.
4796
instruct ReplD_imm(vec dst, immD con) %{
4797
  predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
4798
  match(Set dst (Replicate con));
4799
  format %{ "replicateD $dst,$con" %}
4800
  ins_encode %{
4801
    InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
4802
    int vlen = Matcher::vector_length_in_bytes(this);
4803
    __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
4804
  %}
4805
  ins_pipe( pipe_slow );
4806
%}
4807

4808
instruct ReplD_zero(vec dst, immD0 zero) %{
4809
  predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
4810
  match(Set dst (Replicate zero));
4811
  format %{ "replicateD $dst,$zero" %}
4812
  ins_encode %{
4813
    int vlen_enc = vector_length_encoding(this);
4814
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
4815
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4816
    } else {
4817
      __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4818
    }
4819
  %}
4820
  ins_pipe( fpu_reg_reg );
4821
%}
4822

4823
// ====================VECTOR INSERT=======================================
4824
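// For vectors wider than 128 bits, an insert is done by extracting the 128-bit lane that
// holds the element, inserting into that lane, and writing the lane back into the vector.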

4825
instruct insert(vec dst, rRegI val, immU8 idx) %{
4826
  predicate(Matcher::vector_length_in_bytes(n) < 32);
4827
  match(Set dst (VectorInsert (Binary dst val) idx));
4828
  format %{ "vector_insert $dst,$val,$idx" %}
4829
  ins_encode %{
4830
    assert(UseSSE >= 4, "required");
4831
    assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
4832

4833
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4834

4835
    assert(is_integral_type(elem_bt), "");
4836
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4837

4838
    __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4839
  %}
4840
  ins_pipe( pipe_slow );
4841
%}
4842

4843
instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4844
  predicate(Matcher::vector_length_in_bytes(n) == 32);
4845
  match(Set dst (VectorInsert (Binary src val) idx));
4846
  effect(TEMP vtmp);
4847
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4848
  ins_encode %{
4849
    int vlen_enc = Assembler::AVX_256bit;
4850
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4851
    int elem_per_lane = 16/type2aelembytes(elem_bt);
4852
    int log2epr = log2(elem_per_lane);
4853

4854
    assert(is_integral_type(elem_bt), "sanity");
4855
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4856

4857
    uint x_idx = $idx$$constant & right_n_bits(log2epr);
4858
    uint y_idx = ($idx$$constant >> log2epr) & 1;
4859
    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4860
    __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4861
    __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4862
  %}
4863
  ins_pipe( pipe_slow );
4864
%}
4865

4866
instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4867
  predicate(Matcher::vector_length_in_bytes(n) == 64);
4868
  match(Set dst (VectorInsert (Binary src val) idx));
4869
  effect(TEMP vtmp);
4870
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4871
  ins_encode %{
4872
    assert(UseAVX > 2, "sanity");
4873

4874
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
4875
    int elem_per_lane = 16/type2aelembytes(elem_bt);
4876
    int log2epr = log2(elem_per_lane);
4877

4878
    assert(is_integral_type(elem_bt), "");
4879
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4880

4881
    uint x_idx = $idx$$constant & right_n_bits(log2epr);
4882
    uint y_idx = ($idx$$constant >> log2epr) & 3;
4883
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4884
    __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4885
    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4886
  %}
4887
  ins_pipe( pipe_slow );
4888
%}
4889

4890
#ifdef _LP64
4891
instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4892
  predicate(Matcher::vector_length(n) == 2);
4893
  match(Set dst (VectorInsert (Binary dst val) idx));
4894
  format %{ "vector_insert $dst,$val,$idx" %}
4895
  ins_encode %{
4896
    assert(UseSSE >= 4, "required");
4897
    assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4898
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4899

4900
    __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4901
  %}
4902
  ins_pipe( pipe_slow );
4903
%}
4904

4905
instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4906
  predicate(Matcher::vector_length(n) == 4);
4907
  match(Set dst (VectorInsert (Binary src val) idx));
4908
  effect(TEMP vtmp);
4909
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4910
  ins_encode %{
4911
    assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4912
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4913

4914
    uint x_idx = $idx$$constant & right_n_bits(1);
4915
    uint y_idx = ($idx$$constant >> 1) & 1;
4916
    int vlen_enc = Assembler::AVX_256bit;
4917
    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4918
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4919
    __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4920
  %}
4921
  ins_pipe( pipe_slow );
4922
%}
4923

4924
instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4925
  predicate(Matcher::vector_length(n) == 8);
4926
  match(Set dst (VectorInsert (Binary src val) idx));
4927
  effect(TEMP vtmp);
4928
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4929
  ins_encode %{
4930
    assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
4931
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4932

4933
    uint x_idx = $idx$$constant & right_n_bits(1);
4934
    uint y_idx = ($idx$$constant >> 1) & 3;
4935
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4936
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4937
    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4938
  %}
4939
  ins_pipe( pipe_slow );
4940
%}
4941
#endif
4942

4943
instruct insertF(vec dst, regF val, immU8 idx) %{
4944
  predicate(Matcher::vector_length(n) < 8);
4945
  match(Set dst (VectorInsert (Binary dst val) idx));
4946
  format %{ "vector_insert $dst,$val,$idx" %}
4947
  ins_encode %{
4948
    assert(UseSSE >= 4, "sanity");
4949

4950
    assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4951
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4952

4953
    uint x_idx = $idx$$constant & right_n_bits(2);
4954
    __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4955
  %}
4956
  ins_pipe( pipe_slow );
4957
%}
4958

4959
instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4960
  predicate(Matcher::vector_length(n) >= 8);
4961
  match(Set dst (VectorInsert (Binary src val) idx));
4962
  effect(TEMP vtmp);
4963
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4964
  ins_encode %{
4965
    assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4966
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4967

4968
    int vlen = Matcher::vector_length(this);
4969
    uint x_idx = $idx$$constant & right_n_bits(2);
4970
    if (vlen == 8) {
4971
      uint y_idx = ($idx$$constant >> 2) & 1;
4972
      int vlen_enc = Assembler::AVX_256bit;
4973
      __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4974
      __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4975
      __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4976
    } else {
4977
      assert(vlen == 16, "sanity");
4978
      uint y_idx = ($idx$$constant >> 2) & 3;
4979
      __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4980
      __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4981
      __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4982
    }
4983
  %}
4984
  ins_pipe( pipe_slow );
4985
%}
4986

4987
#ifdef _LP64
4988
instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4989
  predicate(Matcher::vector_length(n) == 2);
4990
  match(Set dst (VectorInsert (Binary dst val) idx));
4991
  effect(TEMP tmp);
4992
  format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4993
  ins_encode %{
4994
    assert(UseSSE >= 4, "sanity");
4995
    assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4996
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4997

4998
    __ movq($tmp$$Register, $val$$XMMRegister);
4999
    __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
5000
  %}
5001
  ins_pipe( pipe_slow );
5002
%}
5003

5004
instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
5005
  predicate(Matcher::vector_length(n) == 4);
5006
  match(Set dst (VectorInsert (Binary src val) idx));
5007
  effect(TEMP vtmp, TEMP tmp);
5008
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
5009
  ins_encode %{
5010
    assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
5011
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
5012

5013
    uint x_idx = $idx$$constant & right_n_bits(1);
5014
    uint y_idx = ($idx$$constant >> 1) & 1;
5015
    int vlen_enc = Assembler::AVX_256bit;
5016
    __ movq($tmp$$Register, $val$$XMMRegister);
5017
    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
5018
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
5019
    __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
5020
  %}
5021
  ins_pipe( pipe_slow );
5022
%}
5023

5024
instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
5025
  predicate(Matcher::vector_length(n) == 8);
5026
  match(Set dst (VectorInsert (Binary src val) idx));
5027
  effect(TEMP tmp, TEMP vtmp);
5028
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
5029
  ins_encode %{
5030
    assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
5031
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
5032

5033
    uint x_idx = $idx$$constant & right_n_bits(1);
5034
    uint y_idx = ($idx$$constant >> 1) & 3;
5035
    __ movq($tmp$$Register, $val$$XMMRegister);
5036
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
5037
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
5038
    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
5039
  %}
5040
  ins_pipe( pipe_slow );
5041
%}
5042
#endif
5043

5044
// ====================REDUCTION ARITHMETIC=======================================
5045
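// Reductions fold a vector input (src2/src) into a scalar starting value (src1, or dst for
// the floating-point forms); the per-lane combining sequences are emitted by the
// reduceI/reduceL/reduce_fp macro-assembler helpers.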

5046
// =======================Int Reduction==========================================
5047

5048
instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
5049
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
5050
  match(Set dst (AddReductionVI src1 src2));
5051
  match(Set dst (MulReductionVI src1 src2));
5052
  match(Set dst (AndReductionV  src1 src2));
5053
  match(Set dst ( OrReductionV  src1 src2));
5054
  match(Set dst (XorReductionV  src1 src2));
5055
  match(Set dst (MinReductionV  src1 src2));
5056
  match(Set dst (MaxReductionV  src1 src2));
5057
  effect(TEMP vtmp1, TEMP vtmp2);
5058
  format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
5059
  ins_encode %{
5060
    int opcode = this->ideal_Opcode();
5061
    int vlen = Matcher::vector_length(this, $src2);
5062
    __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5063
  %}
5064
  ins_pipe( pipe_slow );
5065
%}
5066

5067
// =======================Long Reduction==========================================
5068

5069
#ifdef _LP64
5070
instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
5071
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
5072
  match(Set dst (AddReductionVL src1 src2));
5073
  match(Set dst (MulReductionVL src1 src2));
5074
  match(Set dst (AndReductionV  src1 src2));
5075
  match(Set dst ( OrReductionV  src1 src2));
5076
  match(Set dst (XorReductionV  src1 src2));
5077
  match(Set dst (MinReductionV  src1 src2));
5078
  match(Set dst (MaxReductionV  src1 src2));
5079
  effect(TEMP vtmp1, TEMP vtmp2);
5080
  format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
5081
  ins_encode %{
5082
    int opcode = this->ideal_Opcode();
5083
    int vlen = Matcher::vector_length(this, $src2);
5084
    __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5085
  %}
5086
  ins_pipe( pipe_slow );
5087
%}
5088

5089
instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
5090
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
5091
  match(Set dst (AddReductionVL src1 src2));
5092
  match(Set dst (MulReductionVL src1 src2));
5093
  match(Set dst (AndReductionV  src1 src2));
5094
  match(Set dst ( OrReductionV  src1 src2));
5095
  match(Set dst (XorReductionV  src1 src2));
5096
  match(Set dst (MinReductionV  src1 src2));
5097
  match(Set dst (MaxReductionV  src1 src2));
5098
  effect(TEMP vtmp1, TEMP vtmp2);
5099
  format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
5100
  ins_encode %{
5101
    int opcode = this->ideal_Opcode();
5102
    int vlen = Matcher::vector_length(this, $src2);
5103
    __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5104
  %}
5105
  ins_pipe( pipe_slow );
5106
%}
5107
#endif // _LP64
5108

5109
// =======================Float Reduction==========================================
5110

5111
instruct reductionF128(regF dst, vec src, vec vtmp) %{
5112
  predicate(Matcher::vector_length(n->in(2)) <= 4); // src
5113
  match(Set dst (AddReductionVF dst src));
5114
  match(Set dst (MulReductionVF dst src));
5115
  effect(TEMP dst, TEMP vtmp);
5116
  format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
5117
  ins_encode %{
5118
    int opcode = this->ideal_Opcode();
5119
    int vlen = Matcher::vector_length(this, $src);
5120
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
5121
  %}
5122
  ins_pipe( pipe_slow );
5123
%}
5124

5125
instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
5126
  predicate(Matcher::vector_length(n->in(2)) == 8); // src
5127
  match(Set dst (AddReductionVF dst src));
5128
  match(Set dst (MulReductionVF dst src));
5129
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
5130
  format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
5131
  ins_encode %{
5132
    int opcode = this->ideal_Opcode();
5133
    int vlen = Matcher::vector_length(this, $src);
5134
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5135
  %}
5136
  ins_pipe( pipe_slow );
5137
%}
5138

5139
instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
5140
  predicate(Matcher::vector_length(n->in(2)) == 16); // src
5141
  match(Set dst (AddReductionVF dst src));
5142
  match(Set dst (MulReductionVF dst src));
5143
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
5144
  format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
5145
  ins_encode %{
5146
    int opcode = this->ideal_Opcode();
5147
    int vlen = Matcher::vector_length(this, $src);
5148
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5149
  %}
5150
  ins_pipe( pipe_slow );
5151
%}
5152

5153
// =======================Double Reduction==========================================
5154

5155
instruct reduction2D(regD dst, vec src, vec vtmp) %{
5156
  predicate(Matcher::vector_length(n->in(2)) == 2); // src
5157
  match(Set dst (AddReductionVD dst src));
5158
  match(Set dst (MulReductionVD dst src));
5159
  effect(TEMP dst, TEMP vtmp);
5160
  format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
5161
  ins_encode %{
5162
    int opcode = this->ideal_Opcode();
5163
    int vlen = Matcher::vector_length(this, $src);
5164
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
5165
  %}
5166
  ins_pipe( pipe_slow );
5167
%}
5168

5169
instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
5170
  predicate(Matcher::vector_length(n->in(2)) == 4); // src
5171
  match(Set dst (AddReductionVD dst src));
5172
  match(Set dst (MulReductionVD dst src));
5173
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
5174
  format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
5175
  ins_encode %{
5176
    int opcode = this->ideal_Opcode();
5177
    int vlen = Matcher::vector_length(this, $src);
5178
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5179
  %}
5180
  ins_pipe( pipe_slow );
5181
%}
5182

5183
instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
5184
  predicate(Matcher::vector_length(n->in(2)) == 8); // src
5185
  match(Set dst (AddReductionVD dst src));
5186
  match(Set dst (MulReductionVD dst src));
5187
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
5188
  format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
5189
  ins_encode %{
5190
    int opcode = this->ideal_Opcode();
5191
    int vlen = Matcher::vector_length(this, $src);
5192
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5193
  %}
5194
  ins_pipe( pipe_slow );
5195
%}
5196

5197
// =======================Byte Reduction==========================================
5198

5199
#ifdef _LP64
5200
instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
5201
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
5202
  match(Set dst (AddReductionVI src1 src2));
5203
  match(Set dst (AndReductionV  src1 src2));
5204
  match(Set dst ( OrReductionV  src1 src2));
5205
  match(Set dst (XorReductionV  src1 src2));
5206
  match(Set dst (MinReductionV  src1 src2));
5207
  match(Set dst (MaxReductionV  src1 src2));
5208
  effect(TEMP vtmp1, TEMP vtmp2);
5209
  format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
5210
  ins_encode %{
5211
    int opcode = this->ideal_Opcode();
5212
    int vlen = Matcher::vector_length(this, $src2);
5213
    __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5214
  %}
5215
  ins_pipe( pipe_slow );
5216
%}
5217

5218
instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
5219
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
5220
  match(Set dst (AddReductionVI src1 src2));
5221
  match(Set dst (AndReductionV  src1 src2));
5222
  match(Set dst ( OrReductionV  src1 src2));
5223
  match(Set dst (XorReductionV  src1 src2));
5224
  match(Set dst (MinReductionV  src1 src2));
5225
  match(Set dst (MaxReductionV  src1 src2));
5226
  effect(TEMP vtmp1, TEMP vtmp2);
5227
  format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
5228
  ins_encode %{
5229
    int opcode = this->ideal_Opcode();
5230
    int vlen = Matcher::vector_length(this, $src2);
5231
    __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5232
  %}
5233
  ins_pipe( pipe_slow );
5234
%}
5235
#endif
5236

5237
// =======================Short Reduction==========================================
5238

5239
instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
5240
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
5241
  match(Set dst (AddReductionVI src1 src2));
5242
  match(Set dst (MulReductionVI src1 src2));
5243
  match(Set dst (AndReductionV  src1 src2));
5244
  match(Set dst ( OrReductionV  src1 src2));
5245
  match(Set dst (XorReductionV  src1 src2));
5246
  match(Set dst (MinReductionV  src1 src2));
5247
  match(Set dst (MaxReductionV  src1 src2));
5248
  effect(TEMP vtmp1, TEMP vtmp2);
5249
  format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
5250
  ins_encode %{
5251
    int opcode = this->ideal_Opcode();
5252
    int vlen = Matcher::vector_length(this, $src2);
5253
    __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5254
  %}
5255
  ins_pipe( pipe_slow );
5256
%}
5257

5258
// =======================Mul Reduction==========================================
5259

5260
instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
5261
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
5262
            Matcher::vector_length(n->in(2)) <= 32); // src2
5263
  match(Set dst (MulReductionVI src1 src2));
5264
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
5265
  format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
5266
  ins_encode %{
5267
    int opcode = this->ideal_Opcode();
5268
    int vlen = Matcher::vector_length(this, $src2);
5269
    __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5270
  %}
5271
  ins_pipe( pipe_slow );
5272
%}
5273

5274
instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
5275
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
5276
            Matcher::vector_length(n->in(2)) == 64); // src2
5277
  match(Set dst (MulReductionVI src1 src2));
5278
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
5279
  format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
5280
  ins_encode %{
5281
    int opcode = this->ideal_Opcode();
5282
    int vlen = Matcher::vector_length(this, $src2);
5283
    __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
5284
  %}
5285
  ins_pipe( pipe_slow );
5286
%}
5287

5288
//--------------------Min/Max Float Reduction --------------------
5289
// Float Min Reduction
5290
instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
5291
                            legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
5292
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
5293
            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
5294
             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
5295
            Matcher::vector_length(n->in(2)) == 2);
5296
  match(Set dst (MinReductionV src1 src2));
5297
  match(Set dst (MaxReductionV src1 src2));
5298
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
5299
  format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
5300
  ins_encode %{
5301
    assert(UseAVX > 0, "sanity");
5302

5303
    int opcode = this->ideal_Opcode();
5304
    int vlen = Matcher::vector_length(this, $src2);
5305
    __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
5306
                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
5307
  %}
5308
  ins_pipe( pipe_slow );
5309
%}
5310

5311
instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
5312
                           legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
5313
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
5314
            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
5315
             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
5316
            Matcher::vector_length(n->in(2)) >= 4);
5317
  match(Set dst (MinReductionV src1 src2));
5318
  match(Set dst (MaxReductionV src1 src2));
5319
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
5320
  format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
5321
  ins_encode %{
5322
    assert(UseAVX > 0, "sanity");
5323

5324
    int opcode = this->ideal_Opcode();
5325
    int vlen = Matcher::vector_length(this, $src2);
5326
    __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
5327
                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
5328
  %}
5329
  ins_pipe( pipe_slow );
5330
%}
5331

5332
instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
5333
                               legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
5334
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
5335
            Matcher::vector_length(n->in(2)) == 2);
5336
  match(Set dst (MinReductionV dst src));
5337
  match(Set dst (MaxReductionV dst src));
5338
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
5339
  format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
5340
  ins_encode %{
5341
    assert(UseAVX > 0, "sanity");
5342

5343
    int opcode = this->ideal_Opcode();
5344
    int vlen = Matcher::vector_length(this, $src);
5345
    __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
5346
                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
5347
  %}
5348
  ins_pipe( pipe_slow );
5349
%}
5350

5351

5352
instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
5353
                              legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
5354
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
5355
            Matcher::vector_length(n->in(2)) >= 4);
5356
  match(Set dst (MinReductionV dst src));
5357
  match(Set dst (MaxReductionV dst src));
5358
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
5359
  format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
5360
  ins_encode %{
5361
    assert(UseAVX > 0, "sanity");
5362

5363
    int opcode = this->ideal_Opcode();
5364
    int vlen = Matcher::vector_length(this, $src);
5365
    __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
5366
                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
5367
  %}
5368
  ins_pipe( pipe_slow );
5369
%}
5370

5371

5372
//--------------------Min Double Reduction --------------------
5373
instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
5374
                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
5375
                            rFlagsReg cr) %{
5376
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5377
            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5378
             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5379
            Matcher::vector_length(n->in(2)) == 2);
5380
  match(Set dst (MinReductionV src1 src2));
5381
  match(Set dst (MaxReductionV src1 src2));
5382
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5383
  format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5384
  ins_encode %{
5385
    assert(UseAVX > 0, "sanity");
5386

5387
    int opcode = this->ideal_Opcode();
5388
    int vlen = Matcher::vector_length(this, $src2);
5389
    __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5390
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5391
  %}
5392
  ins_pipe( pipe_slow );
5393
%}
5394

5395
instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
5396
                           legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5397
                           rFlagsReg cr) %{
5398
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5399
            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5400
             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5401
            Matcher::vector_length(n->in(2)) >= 4);
5402
  match(Set dst (MinReductionV src1 src2));
5403
  match(Set dst (MaxReductionV src1 src2));
5404
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5405
  format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5406
  ins_encode %{
5407
    assert(UseAVX > 0, "sanity");
5408

5409
    int opcode = this->ideal_Opcode();
5410
    int vlen = Matcher::vector_length(this, $src2);
5411
    __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5412
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5413
  %}
5414
  ins_pipe( pipe_slow );
5415
%}
5416

5417

5418
instruct minmax_reduction2D_av(legRegD dst, legVec src,
5419
                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
5420
                               rFlagsReg cr) %{
5421
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5422
            Matcher::vector_length(n->in(2)) == 2);
5423
  match(Set dst (MinReductionV dst src));
5424
  match(Set dst (MaxReductionV dst src));
5425
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5426
  format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5427
  ins_encode %{
5428
    assert(UseAVX > 0, "sanity");
5429

5430
    int opcode = this->ideal_Opcode();
5431
    int vlen = Matcher::vector_length(this, $src);
5432
    __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5433
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5434
  %}
5435
  ins_pipe( pipe_slow );
5436
%}
5437

5438
instruct minmax_reductionD_av(legRegD dst, legVec src,
5439
                              legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5440
                              rFlagsReg cr) %{
5441
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5442
            Matcher::vector_length(n->in(2)) >= 4);
5443
  match(Set dst (MinReductionV dst src));
5444
  match(Set dst (MaxReductionV dst src));
5445
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5446
  format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5447
  ins_encode %{
5448
    assert(UseAVX > 0, "sanity");
5449

5450
    int opcode = this->ideal_Opcode();
5451
    int vlen = Matcher::vector_length(this, $src);
5452
    __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5453
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5454
  %}
5455
  ins_pipe( pipe_slow );
5456
%}
5457

5458
// ====================VECTOR ARITHMETIC=======================================
5459

5460
// --------------------------------- ADD --------------------------------------
5461

5462
// Bytes vector add
5463
instruct vaddB(vec dst, vec src) %{
5464
  predicate(UseAVX == 0);
5465
  match(Set dst (AddVB dst src));
5466
  format %{ "paddb   $dst,$src\t! add packedB" %}
5467
  ins_encode %{
5468
    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5469
  %}
5470
  ins_pipe( pipe_slow );
5471
%}
5472

5473
instruct vaddB_reg(vec dst, vec src1, vec src2) %{
5474
  predicate(UseAVX > 0);
5475
  match(Set dst (AddVB src1 src2));
5476
  format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
5477
  ins_encode %{
5478
    int vlen_enc = vector_length_encoding(this);
5479
    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5480
  %}
5481
  ins_pipe( pipe_slow );
5482
%}
5483

5484
instruct vaddB_mem(vec dst, vec src, memory mem) %{
5485
  predicate((UseAVX > 0) &&
5486
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5487
  match(Set dst (AddVB src (LoadVector mem)));
5488
  format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
5489
  ins_encode %{
5490
    int vlen_enc = vector_length_encoding(this);
5491
    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5492
  %}
5493
  ins_pipe( pipe_slow );
5494
%}
5495

5496
// Shorts/Chars vector add
5497
instruct vaddS(vec dst, vec src) %{
5498
  predicate(UseAVX == 0);
5499
  match(Set dst (AddVS dst src));
5500
  format %{ "paddw   $dst,$src\t! add packedS" %}
5501
  ins_encode %{
5502
    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5503
  %}
5504
  ins_pipe( pipe_slow );
5505
%}
5506

5507
instruct vaddS_reg(vec dst, vec src1, vec src2) %{
5508
  predicate(UseAVX > 0);
5509
  match(Set dst (AddVS src1 src2));
5510
  format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
5511
  ins_encode %{
5512
    int vlen_enc = vector_length_encoding(this);
5513
    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5514
  %}
5515
  ins_pipe( pipe_slow );
5516
%}
5517

5518
instruct vaddS_mem(vec dst, vec src, memory mem) %{
5519
  predicate((UseAVX > 0) &&
5520
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5521
  match(Set dst (AddVS src (LoadVector mem)));
5522
  format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
5523
  ins_encode %{
5524
    int vlen_enc = vector_length_encoding(this);
5525
    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5526
  %}
5527
  ins_pipe( pipe_slow );
5528
%}
5529

5530
// Integers vector add
5531
instruct vaddI(vec dst, vec src) %{
5532
  predicate(UseAVX == 0);
5533
  match(Set dst (AddVI dst src));
5534
  format %{ "paddd   $dst,$src\t! add packedI" %}
5535
  ins_encode %{
5536
    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5537
  %}
5538
  ins_pipe( pipe_slow );
5539
%}
5540

5541
instruct vaddI_reg(vec dst, vec src1, vec src2) %{
5542
  predicate(UseAVX > 0);
5543
  match(Set dst (AddVI src1 src2));
5544
  format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
5545
  ins_encode %{
5546
    int vlen_enc = vector_length_encoding(this);
5547
    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5548
  %}
5549
  ins_pipe( pipe_slow );
5550
%}
5551

5552

5553
instruct vaddI_mem(vec dst, vec src, memory mem) %{
5554
  predicate((UseAVX > 0) &&
5555
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5556
  match(Set dst (AddVI src (LoadVector mem)));
5557
  format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
5558
  ins_encode %{
5559
    int vlen_enc = vector_length_encoding(this);
5560
    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5561
  %}
5562
  ins_pipe( pipe_slow );
5563
%}
5564

5565
// Longs vector add
5566
instruct vaddL(vec dst, vec src) %{
5567
  predicate(UseAVX == 0);
5568
  match(Set dst (AddVL dst src));
5569
  format %{ "paddq   $dst,$src\t! add packedL" %}
5570
  ins_encode %{
5571
    __ paddq($dst$$XMMRegister, $src$$XMMRegister);
5572
  %}
5573
  ins_pipe( pipe_slow );
5574
%}
5575

5576
instruct vaddL_reg(vec dst, vec src1, vec src2) %{
5577
  predicate(UseAVX > 0);
5578
  match(Set dst (AddVL src1 src2));
5579
  format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
5580
  ins_encode %{
5581
    int vlen_enc = vector_length_encoding(this);
5582
    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5583
  %}
5584
  ins_pipe( pipe_slow );
5585
%}
5586

5587
instruct vaddL_mem(vec dst, vec src, memory mem) %{
5588
  predicate((UseAVX > 0) &&
5589
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5590
  match(Set dst (AddVL src (LoadVector mem)));
5591
  format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
5592
  ins_encode %{
5593
    int vlen_enc = vector_length_encoding(this);
5594
    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5595
  %}
5596
  ins_pipe( pipe_slow );
5597
%}
5598

5599
// Floats vector add
5600
instruct vaddF(vec dst, vec src) %{
5601
  predicate(UseAVX == 0);
5602
  match(Set dst (AddVF dst src));
5603
  format %{ "addps   $dst,$src\t! add packedF" %}
5604
  ins_encode %{
5605
    __ addps($dst$$XMMRegister, $src$$XMMRegister);
5606
  %}
5607
  ins_pipe( pipe_slow );
5608
%}
5609

5610
instruct vaddF_reg(vec dst, vec src1, vec src2) %{
5611
  predicate(UseAVX > 0);
5612
  match(Set dst (AddVF src1 src2));
5613
  format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
5614
  ins_encode %{
5615
    int vlen_enc = vector_length_encoding(this);
5616
    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5617
  %}
5618
  ins_pipe( pipe_slow );
5619
%}
5620

5621
instruct vaddF_mem(vec dst, vec src, memory mem) %{
5622
  predicate((UseAVX > 0) &&
5623
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5624
  match(Set dst (AddVF src (LoadVector mem)));
5625
  format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
5626
  ins_encode %{
5627
    int vlen_enc = vector_length_encoding(this);
5628
    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5629
  %}
5630
  ins_pipe( pipe_slow );
5631
%}
5632

5633
// Doubles vector add
5634
instruct vaddD(vec dst, vec src) %{
5635
  predicate(UseAVX == 0);
5636
  match(Set dst (AddVD dst src));
5637
  format %{ "addpd   $dst,$src\t! add packedD" %}
5638
  ins_encode %{
5639
    __ addpd($dst$$XMMRegister, $src$$XMMRegister);
5640
  %}
5641
  ins_pipe( pipe_slow );
5642
%}
5643

5644
instruct vaddD_reg(vec dst, vec src1, vec src2) %{
5645
  predicate(UseAVX > 0);
5646
  match(Set dst (AddVD src1 src2));
5647
  format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
5648
  ins_encode %{
5649
    int vlen_enc = vector_length_encoding(this);
5650
    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5651
  %}
5652
  ins_pipe( pipe_slow );
5653
%}
5654

5655
instruct vaddD_mem(vec dst, vec src, memory mem) %{
5656
  predicate((UseAVX > 0) &&
5657
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5658
  match(Set dst (AddVD src (LoadVector mem)));
5659
  format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5660
  ins_encode %{
5661
    int vlen_enc = vector_length_encoding(this);
5662
    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5663
  %}
5664
  ins_pipe( pipe_slow );
5665
%}
5666

5667
// --------------------------------- SUB --------------------------------------
5668

5669
// Bytes vector sub
5670
instruct vsubB(vec dst, vec src) %{
5671
  predicate(UseAVX == 0);
5672
  match(Set dst (SubVB dst src));
5673
  format %{ "psubb   $dst,$src\t! sub packedB" %}
5674
  ins_encode %{
5675
    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5676
  %}
5677
  ins_pipe( pipe_slow );
5678
%}
5679

5680
instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5681
  predicate(UseAVX > 0);
5682
  match(Set dst (SubVB src1 src2));
5683
  format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5684
  ins_encode %{
5685
    int vlen_enc = vector_length_encoding(this);
5686
    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5687
  %}
5688
  ins_pipe( pipe_slow );
5689
%}
5690

5691
instruct vsubB_mem(vec dst, vec src, memory mem) %{
5692
  predicate((UseAVX > 0) &&
5693
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5694
  match(Set dst (SubVB src (LoadVector mem)));
5695
  format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5696
  ins_encode %{
5697
    int vlen_enc = vector_length_encoding(this);
5698
    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5699
  %}
5700
  ins_pipe( pipe_slow );
5701
%}
5702

5703
// Shorts/Chars vector sub
5704
instruct vsubS(vec dst, vec src) %{
5705
  predicate(UseAVX == 0);
5706
  match(Set dst (SubVS dst src));
5707
  format %{ "psubw   $dst,$src\t! sub packedS" %}
5708
  ins_encode %{
5709
    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5710
  %}
5711
  ins_pipe( pipe_slow );
5712
%}
5713

5714

5715
instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5716
  predicate(UseAVX > 0);
5717
  match(Set dst (SubVS src1 src2));
5718
  format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5719
  ins_encode %{
5720
    int vlen_enc = vector_length_encoding(this);
5721
    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5722
  %}
5723
  ins_pipe( pipe_slow );
5724
%}
5725

5726
instruct vsubS_mem(vec dst, vec src, memory mem) %{
5727
  predicate((UseAVX > 0) &&
5728
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5729
  match(Set dst (SubVS src (LoadVector mem)));
5730
  format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5731
  ins_encode %{
5732
    int vlen_enc = vector_length_encoding(this);
5733
    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5734
  %}
5735
  ins_pipe( pipe_slow );
5736
%}
5737

5738
// Integers vector sub
5739
instruct vsubI(vec dst, vec src) %{
5740
  predicate(UseAVX == 0);
5741
  match(Set dst (SubVI dst src));
5742
  format %{ "psubd   $dst,$src\t! sub packedI" %}
5743
  ins_encode %{
5744
    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5745
  %}
5746
  ins_pipe( pipe_slow );
5747
%}
5748

5749
instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5750
  predicate(UseAVX > 0);
5751
  match(Set dst (SubVI src1 src2));
5752
  format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5753
  ins_encode %{
5754
    int vlen_enc = vector_length_encoding(this);
5755
    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5756
  %}
5757
  ins_pipe( pipe_slow );
5758
%}
5759

5760
instruct vsubI_mem(vec dst, vec src, memory mem) %{
5761
  predicate((UseAVX > 0) &&
5762
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5763
  match(Set dst (SubVI src (LoadVector mem)));
5764
  format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5765
  ins_encode %{
5766
    int vlen_enc = vector_length_encoding(this);
5767
    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5768
  %}
5769
  ins_pipe( pipe_slow );
5770
%}
5771

5772
// Longs vector sub
5773
instruct vsubL(vec dst, vec src) %{
5774
  predicate(UseAVX == 0);
5775
  match(Set dst (SubVL dst src));
5776
  format %{ "psubq   $dst,$src\t! sub packedL" %}
5777
  ins_encode %{
5778
    __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5779
  %}
5780
  ins_pipe( pipe_slow );
5781
%}
5782

5783
instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5784
  predicate(UseAVX > 0);
5785
  match(Set dst (SubVL src1 src2));
5786
  format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5787
  ins_encode %{
5788
    int vlen_enc = vector_length_encoding(this);
5789
    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5790
  %}
5791
  ins_pipe( pipe_slow );
5792
%}
5793

5794

5795
instruct vsubL_mem(vec dst, vec src, memory mem) %{
5796
  predicate((UseAVX > 0) &&
5797
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5798
  match(Set dst (SubVL src (LoadVector mem)));
5799
  format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5800
  ins_encode %{
5801
    int vlen_enc = vector_length_encoding(this);
5802
    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5803
  %}
5804
  ins_pipe( pipe_slow );
5805
%}
5806

5807
// Floats vector sub
5808
instruct vsubF(vec dst, vec src) %{
5809
  predicate(UseAVX == 0);
5810
  match(Set dst (SubVF dst src));
5811
  format %{ "subps   $dst,$src\t! sub packedF" %}
5812
  ins_encode %{
5813
    __ subps($dst$$XMMRegister, $src$$XMMRegister);
5814
  %}
5815
  ins_pipe( pipe_slow );
5816
%}
5817

5818
instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5819
  predicate(UseAVX > 0);
5820
  match(Set dst (SubVF src1 src2));
5821
  format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5822
  ins_encode %{
5823
    int vlen_enc = vector_length_encoding(this);
5824
    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5825
  %}
5826
  ins_pipe( pipe_slow );
5827
%}
5828

5829
instruct vsubF_mem(vec dst, vec src, memory mem) %{
5830
  predicate((UseAVX > 0) &&
5831
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5832
  match(Set dst (SubVF src (LoadVector mem)));
5833
  format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5834
  ins_encode %{
5835
    int vlen_enc = vector_length_encoding(this);
5836
    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5837
  %}
5838
  ins_pipe( pipe_slow );
5839
%}
5840

5841
// Doubles vector sub
5842
instruct vsubD(vec dst, vec src) %{
5843
  predicate(UseAVX == 0);
5844
  match(Set dst (SubVD dst src));
5845
  format %{ "subpd   $dst,$src\t! sub packedD" %}
5846
  ins_encode %{
5847
    __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5848
  %}
5849
  ins_pipe( pipe_slow );
5850
%}
5851

5852
instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5853
  predicate(UseAVX > 0);
5854
  match(Set dst (SubVD src1 src2));
5855
  format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5856
  ins_encode %{
5857
    int vlen_enc = vector_length_encoding(this);
5858
    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5859
  %}
5860
  ins_pipe( pipe_slow );
5861
%}
5862

5863
instruct vsubD_mem(vec dst, vec src, memory mem) %{
5864
  predicate((UseAVX > 0) &&
5865
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5866
  match(Set dst (SubVD src (LoadVector mem)));
5867
  format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5868
  ins_encode %{
5869
    int vlen_enc = vector_length_encoding(this);
5870
    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5871
  %}
5872
  ins_pipe( pipe_slow );
5873
%}
5874

5875
// --------------------------------- MUL --------------------------------------
5876

5877
// Byte vector mul
5878
instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
5879
  predicate(Matcher::vector_length_in_bytes(n) <= 8);
5880
  match(Set dst (MulVB src1 src2));
5881
  effect(TEMP dst, TEMP xtmp);
5882
  format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
5883
  ins_encode %{
5884
    assert(UseSSE > 3, "required");
5885
    __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
5886
    __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
5887
    __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
5888
    __ psllw($dst$$XMMRegister, 8);
5889
    __ psrlw($dst$$XMMRegister, 8);
5890
    __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5891
  %}
5892
  ins_pipe( pipe_slow );
5893
%}
5894

5895
instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
5896
  predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
5897
  match(Set dst (MulVB src1 src2));
5898
  effect(TEMP dst, TEMP xtmp);
5899
  format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
5900
  ins_encode %{
5901
    assert(UseSSE > 3, "required");
5902
    // Odd-index elements
5903
    __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
5904
    __ psrlw($dst$$XMMRegister, 8);
5905
    __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
5906
    __ psrlw($xtmp$$XMMRegister, 8);
5907
    __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
5908
    __ psllw($dst$$XMMRegister, 8);
5909
    // Even-index elements
5910
    __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
5911
    __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
5912
    __ psllw($xtmp$$XMMRegister, 8);
5913
    __ psrlw($xtmp$$XMMRegister, 8);
5914
    // Combine
5915
    __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
5916
  %}
5917
  ins_pipe( pipe_slow );
5918
%}
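// x86 has no packed byte multiply, so MulVB is emulated with 16-bit multiplies:
// odd-index bytes are isolated with a word right shift, multiplied, and shifted
// back into the high byte; even-index bytes are multiplied in place and masked
// to the low byte; the two halves are then OR-ed together. A minimal scalar
// model of one 16-bit lane (illustrative only):
//
//   static uint16_t mul_bytes_in_word(uint16_t a, uint16_t b) {
//     uint16_t odd  = (uint16_t)(((a >> 8) * (b >> 8)) << 8);        // high-byte product
//     uint16_t even = (uint16_t)(((a & 0xFF) * (b & 0xFF)) & 0xFF);  // low-byte product
//     return (uint16_t)(odd | even);
//   }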
5919

5920
instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
5921
  predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
5922
  match(Set dst (MulVB src1 src2));
5923
  effect(TEMP xtmp1, TEMP xtmp2);
5924
  format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
5925
  ins_encode %{
5926
    int vlen_enc = vector_length_encoding(this);
5927
    // Odd-index elements
5928
    __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
5929
    __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
5930
    __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
5931
    __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
5932
    // Even-index elements
5933
    __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5934
    __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
5935
    __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
5936
    // Combine
5937
    __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
5938
  %}
5939
  ins_pipe( pipe_slow );
5940
%}
5941

5942
// Shorts/Chars vector mul
5943
instruct vmulS(vec dst, vec src) %{
5944
  predicate(UseAVX == 0);
5945
  match(Set dst (MulVS dst src));
5946
  format %{ "pmullw  $dst,$src\t! mul packedS" %}
5947
  ins_encode %{
5948
    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5949
  %}
5950
  ins_pipe( pipe_slow );
5951
%}
5952

5953
instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5954
  predicate(UseAVX > 0);
5955
  match(Set dst (MulVS src1 src2));
5956
  format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5957
  ins_encode %{
5958
    int vlen_enc = vector_length_encoding(this);
5959
    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5960
  %}
5961
  ins_pipe( pipe_slow );
5962
%}
5963

5964
instruct vmulS_mem(vec dst, vec src, memory mem) %{
5965
  predicate((UseAVX > 0) &&
5966
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5967
  match(Set dst (MulVS src (LoadVector mem)));
5968
  format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5969
  ins_encode %{
5970
    int vlen_enc = vector_length_encoding(this);
5971
    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5972
  %}
5973
  ins_pipe( pipe_slow );
5974
%}
5975

5976
// Integers vector mul
5977
instruct vmulI(vec dst, vec src) %{
5978
  predicate(UseAVX == 0);
5979
  match(Set dst (MulVI dst src));
5980
  format %{ "pmulld  $dst,$src\t! mul packedI" %}
5981
  ins_encode %{
5982
    assert(UseSSE > 3, "required");
5983
    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5984
  %}
5985
  ins_pipe( pipe_slow );
5986
%}
5987

5988
instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5989
  predicate(UseAVX > 0);
5990
  match(Set dst (MulVI src1 src2));
5991
  format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5992
  ins_encode %{
5993
    int vlen_enc = vector_length_encoding(this);
5994
    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5995
  %}
5996
  ins_pipe( pipe_slow );
5997
%}
5998

5999
instruct vmulI_mem(vec dst, vec src, memory mem) %{
6000
  predicate((UseAVX > 0) &&
6001
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6002
  match(Set dst (MulVI src (LoadVector mem)));
6003
  format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
6004
  ins_encode %{
6005
    int vlen_enc = vector_length_encoding(this);
6006
    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6007
  %}
6008
  ins_pipe( pipe_slow );
6009
%}
6010

6011
// Longs vector mul
6012
instruct evmulL_reg(vec dst, vec src1, vec src2) %{
6013
  predicate((Matcher::vector_length_in_bytes(n) == 64 &&
6014
             VM_Version::supports_avx512dq()) ||
6015
            VM_Version::supports_avx512vldq());
6016
  match(Set dst (MulVL src1 src2));
6017
  format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
6018
  ins_encode %{
6019
    assert(UseAVX > 2, "required");
6020
    int vlen_enc = vector_length_encoding(this);
6021
    __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6022
  %}
6023
  ins_pipe( pipe_slow );
6024
%}
6025

6026
instruct evmulL_mem(vec dst, vec src, memory mem) %{
6027
  predicate((Matcher::vector_length_in_bytes(n) == 64 &&
6028
             VM_Version::supports_avx512dq()) ||
6029
            (Matcher::vector_length_in_bytes(n) > 8 &&
6030
             VM_Version::supports_avx512vldq()));
6031
  match(Set dst (MulVL src (LoadVector mem)));
6032
  format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
6033
  ins_encode %{
6034
    assert(UseAVX > 2, "required");
6035
    int vlen_enc = vector_length_encoding(this);
6036
    __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6037
  %}
6038
  ins_pipe( pipe_slow );
6039
%}
6040

6041
instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
6042
  predicate(UseAVX == 0);
6043
  match(Set dst (MulVL src1 src2));
6044
  effect(TEMP dst, TEMP xtmp);
6045
  format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
6046
  ins_encode %{
6047
    assert(VM_Version::supports_sse4_1(), "required");
6048
    // Get the lo*hi cross products; only their lower 32 bits are needed
6049
    __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
6050
    __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
6051
    __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
6052
    __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
6053
    __ psllq($dst$$XMMRegister, 32);
6054
    // Get the lo-lo products
6055
    __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
6056
    __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
6057
    __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
6058
  %}
6059
  ins_pipe( pipe_slow );
6060
%}
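// When no direct packed 64-bit multiply is available, MulVL is synthesised
// from 32-bit multiplies: with a = aH*2^32 + aL and b = bH*2^32 + bL,
// (a * b) mod 2^64 = aL*bL + ((aL*bH + aH*bL) << 32). The "lo*hi" step above
// computes the cross terms and the "lo-lo" step computes aL*bL. A minimal
// scalar model (illustrative only):
//
//   static uint64_t mul64_via_32bit(uint64_t a, uint64_t b) {
//     uint64_t aL = (uint32_t)a, aH = a >> 32;
//     uint64_t bL = (uint32_t)b, bH = b >> 32;
//     uint64_t cross = (aL * bH + aH * bL) << 32;   // high-half cross terms
//     return aL * bL + cross;                       // low product + cross terms
//   }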
6061

6062
instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
6063
  predicate(UseAVX > 0 &&
6064
            ((Matcher::vector_length_in_bytes(n) == 64 &&
6065
              !VM_Version::supports_avx512dq()) ||
6066
             (Matcher::vector_length_in_bytes(n) < 64 &&
6067
              !VM_Version::supports_avx512vldq())));
6068
  match(Set dst (MulVL src1 src2));
6069
  effect(TEMP xtmp1, TEMP xtmp2);
6070
  format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
6071
  ins_encode %{
6072
    int vlen_enc = vector_length_encoding(this);
6073
    // Get the lo*hi cross products; only their lower 32 bits are needed
6074
    __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
6075
    __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
6076
    __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
6077
    __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
6078
    __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
6079
    // Get the lo-lo products
6080
    __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6081
    __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
6082
  %}
6083
  ins_pipe( pipe_slow );
6084
%}
6085

6086
// Floats vector mul
6087
instruct vmulF(vec dst, vec src) %{
6088
  predicate(UseAVX == 0);
6089
  match(Set dst (MulVF dst src));
6090
  format %{ "mulps   $dst,$src\t! mul packedF" %}
6091
  ins_encode %{
6092
    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
6093
  %}
6094
  ins_pipe( pipe_slow );
6095
%}
6096

6097
instruct vmulF_reg(vec dst, vec src1, vec src2) %{
6098
  predicate(UseAVX > 0);
6099
  match(Set dst (MulVF src1 src2));
6100
  format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
6101
  ins_encode %{
6102
    int vlen_enc = vector_length_encoding(this);
6103
    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6104
  %}
6105
  ins_pipe( pipe_slow );
6106
%}
6107

6108
instruct vmulF_mem(vec dst, vec src, memory mem) %{
6109
  predicate((UseAVX > 0) &&
6110
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6111
  match(Set dst (MulVF src (LoadVector mem)));
6112
  format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
6113
  ins_encode %{
6114
    int vlen_enc = vector_length_encoding(this);
6115
    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6116
  %}
6117
  ins_pipe( pipe_slow );
6118
%}
6119

6120
// Doubles vector mul
6121
instruct vmulD(vec dst, vec src) %{
6122
  predicate(UseAVX == 0);
6123
  match(Set dst (MulVD dst src));
6124
  format %{ "mulpd   $dst,$src\t! mul packedD" %}
6125
  ins_encode %{
6126
    __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
6127
  %}
6128
  ins_pipe( pipe_slow );
6129
%}
6130

6131
instruct vmulD_reg(vec dst, vec src1, vec src2) %{
6132
  predicate(UseAVX > 0);
6133
  match(Set dst (MulVD src1 src2));
6134
  format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
6135
  ins_encode %{
6136
    int vlen_enc = vector_length_encoding(this);
6137
    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6138
  %}
6139
  ins_pipe( pipe_slow );
6140
%}
6141

6142
instruct vmulD_mem(vec dst, vec src, memory mem) %{
6143
  predicate((UseAVX > 0) &&
6144
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6145
  match(Set dst (MulVD src (LoadVector mem)));
6146
  format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
6147
  ins_encode %{
6148
    int vlen_enc = vector_length_encoding(this);
6149
    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6150
  %}
6151
  ins_pipe( pipe_slow );
6152
%}
6153

6154
// --------------------------------- DIV --------------------------------------
6155

6156
// Floats vector div
6157
instruct vdivF(vec dst, vec src) %{
6158
  predicate(UseAVX == 0);
6159
  match(Set dst (DivVF dst src));
6160
  format %{ "divps   $dst,$src\t! div packedF" %}
6161
  ins_encode %{
6162
    __ divps($dst$$XMMRegister, $src$$XMMRegister);
6163
  %}
6164
  ins_pipe( pipe_slow );
6165
%}
6166

6167
instruct vdivF_reg(vec dst, vec src1, vec src2) %{
6168
  predicate(UseAVX > 0);
6169
  match(Set dst (DivVF src1 src2));
6170
  format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
6171
  ins_encode %{
6172
    int vlen_enc = vector_length_encoding(this);
6173
    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6174
  %}
6175
  ins_pipe( pipe_slow );
6176
%}
6177

6178
instruct vdivF_mem(vec dst, vec src, memory mem) %{
6179
  predicate((UseAVX > 0) &&
6180
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6181
  match(Set dst (DivVF src (LoadVector mem)));
6182
  format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
6183
  ins_encode %{
6184
    int vlen_enc = vector_length_encoding(this);
6185
    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6186
  %}
6187
  ins_pipe( pipe_slow );
6188
%}
6189

6190
// Doubles vector div
6191
instruct vdivD(vec dst, vec src) %{
6192
  predicate(UseAVX == 0);
6193
  match(Set dst (DivVD dst src));
6194
  format %{ "divpd   $dst,$src\t! div packedD" %}
6195
  ins_encode %{
6196
    __ divpd($dst$$XMMRegister, $src$$XMMRegister);
6197
  %}
6198
  ins_pipe( pipe_slow );
6199
%}
6200

6201
instruct vdivD_reg(vec dst, vec src1, vec src2) %{
6202
  predicate(UseAVX > 0);
6203
  match(Set dst (DivVD src1 src2));
6204
  format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
6205
  ins_encode %{
6206
    int vlen_enc = vector_length_encoding(this);
6207
    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6208
  %}
6209
  ins_pipe( pipe_slow );
6210
%}
6211

6212
instruct vdivD_mem(vec dst, vec src, memory mem) %{
6213
  predicate((UseAVX > 0) &&
6214
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6215
  match(Set dst (DivVD src (LoadVector mem)));
6216
  format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
6217
  ins_encode %{
6218
    int vlen_enc = vector_length_encoding(this);
6219
    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6220
  %}
6221
  ins_pipe( pipe_slow );
6222
%}
6223

6224
// ------------------------------ MinMax ---------------------------------------
6225

6226
// Byte, Short, Int vector Min/Max
6227
instruct minmax_reg_sse(vec dst, vec src) %{
6228
  predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
6229
            UseAVX == 0);
6230
  match(Set dst (MinV dst src));
6231
  match(Set dst (MaxV dst src));
6232
  format %{ "vector_minmax  $dst,$src\t!  " %}
6233
  ins_encode %{
6234
    assert(UseSSE >= 4, "required");
6235

6236
    int opcode = this->ideal_Opcode();
6237
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
6238
    __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
6239
  %}
6240
  ins_pipe( pipe_slow );
6241
%}
6242

6243
instruct vminmax_reg(vec dst, vec src1, vec src2) %{
6244
  predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
6245
            UseAVX > 0);
6246
  match(Set dst (MinV src1 src2));
6247
  match(Set dst (MaxV src1 src2));
6248
  format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
6249
  ins_encode %{
6250
    int opcode = this->ideal_Opcode();
6251
    int vlen_enc = vector_length_encoding(this);
6252
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
6253

6254
    __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6255
  %}
6256
  ins_pipe( pipe_slow );
6257
%}
6258

6259
// Long vector Min/Max
6260
instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
6261
  predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
6262
            UseAVX == 0);
6263
  match(Set dst (MinV dst src));
6264
  match(Set dst (MaxV src dst));
6265
  effect(TEMP dst, TEMP tmp);
6266
  format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
6267
  ins_encode %{
6268
    assert(UseSSE >= 4, "required");
6269

6270
    int opcode = this->ideal_Opcode();
6271
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
6272
    assert(elem_bt == T_LONG, "sanity");
6273

6274
    __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
6275
  %}
6276
  ins_pipe( pipe_slow );
6277
%}
6278

6279
instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
6280
  predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
6281
            UseAVX > 0 && !VM_Version::supports_avx512vl());
6282
  match(Set dst (MinV src1 src2));
6283
  match(Set dst (MaxV src1 src2));
6284
  effect(TEMP dst);
6285
  format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
6286
  ins_encode %{
6287
    int vlen_enc = vector_length_encoding(this);
6288
    int opcode = this->ideal_Opcode();
6289
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
6290
    assert(elem_bt == T_LONG, "sanity");
6291

6292
    __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6293
  %}
6294
  ins_pipe( pipe_slow );
6295
%}
6296

6297
instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
6298
  predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
6299
            Matcher::vector_element_basic_type(n) == T_LONG);
6300
  match(Set dst (MinV src1 src2));
6301
  match(Set dst (MaxV src1 src2));
6302
  format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
6303
  ins_encode %{
6304
    assert(UseAVX > 2, "required");
6305

6306
    int vlen_enc = vector_length_encoding(this);
6307
    int opcode = this->ideal_Opcode();
6308
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
6309
    assert(elem_bt == T_LONG, "sanity");
6310

6311
    __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6312
  %}
6313
  ins_pipe( pipe_slow );
6314
%}
6315

6316
// Float/Double vector Min/Max
6317
instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
6318
  predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
6319
            is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
6320
            UseAVX > 0);
6321
  match(Set dst (MinV a b));
6322
  match(Set dst (MaxV a b));
6323
  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
6324
  format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
6325
  ins_encode %{
6326
    assert(UseAVX > 0, "required");
6327

6328
    int opcode = this->ideal_Opcode();
6329
    int vlen_enc = vector_length_encoding(this);
6330
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
6331

6332
    __ vminmax_fp(opcode, elem_bt,
6333
                  $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6334
                  $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
6335
  %}
6336
  ins_pipe( pipe_slow );
6337
%}
6338

6339
instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
6340
  predicate(Matcher::vector_length_in_bytes(n) == 64 &&
6341
            is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
6342
  match(Set dst (MinV a b));
6343
  match(Set dst (MaxV a b));
6344
  effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
6345
  format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
6346
  ins_encode %{
6347
    assert(UseAVX > 2, "required");
6348

6349
    int opcode = this->ideal_Opcode();
6350
    int vlen_enc = vector_length_encoding(this);
6351
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
6352

6353
    __ evminmax_fp(opcode, elem_bt,
6354
                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6355
                   $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
6356
  %}
6357
  ins_pipe( pipe_slow );
6358
%}
6359

6360
// --------------------------------- Signum/CopySign ---------------------------
6361

6362
instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
6363
  match(Set dst (SignumF dst (Binary zero one)));
6364
  effect(KILL cr);
6365
  format %{ "signumF $dst, $dst" %}
6366
  ins_encode %{
6367
    int opcode = this->ideal_Opcode();
6368
    __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
6369
  %}
6370
  ins_pipe( pipe_slow );
6371
%}
6372

6373
instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
6374
  match(Set dst (SignumD dst (Binary zero one)));
6375
  effect(KILL cr);
6376
  format %{ "signumD $dst, $dst" %}
6377
  ins_encode %{
6378
    int opcode = this->ideal_Opcode();
6379
    __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
6380
  %}
6381
  ins_pipe( pipe_slow );
6382
%}
6383

6384
instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
6385
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
6386
  match(Set dst (SignumVF src (Binary zero one)));
6387
  match(Set dst (SignumVD src (Binary zero one)));
6388
  effect(TEMP dst, TEMP xtmp1);
6389
  format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
6390
  ins_encode %{
6391
    int opcode = this->ideal_Opcode();
6392
    int vec_enc = vector_length_encoding(this);
6393
    __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
6394
                         $xtmp1$$XMMRegister, vec_enc);
6395
  %}
6396
  ins_pipe( pipe_slow );
6397
%}
6398

6399
instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
6400
  predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
6401
  match(Set dst (SignumVF src (Binary zero one)));
6402
  match(Set dst (SignumVD src (Binary zero one)));
6403
  effect(TEMP dst, TEMP ktmp1);
6404
  format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
6405
  ins_encode %{
6406
    int opcode = this->ideal_Opcode();
6407
    int vec_enc = vector_length_encoding(this);
6408
    __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
6409
                          $ktmp1$$KRegister, vec_enc);
6410
  %}
6411
  ins_pipe( pipe_slow );
6412
%}
6413

6414
// ---------------------------------------
6415
// For copySign use 0xE4 as writemask for vpternlog
6416
// Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
6417
// C (xmm2) is set to 0x7FFFFFFF
6418
// Wherever xmm2 is 0, we want to pick from B (sign)
6419
// Wherever xmm2 is 1, we want to pick from A (src)
6420
//
6421
// A B C Result
6422
// 0 0 0 0
6423
// 0 0 1 0
6424
// 0 1 0 1
6425
// 0 1 1 0
6426
// 1 0 0 0
6427
// 1 0 1 1
6428
// 1 1 0 1
6429
// 1 1 1 1
6430
//
6431
// Result, read from the highest row (A,B,C = 1,1,1) down to the lowest, is 0b11100100 = 0xE4
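//
// As a cross-check (illustrative arithmetic, not generated code): the imm8 bit selected
// by each row is bit number (A<<2)|(B<<1)|C, so
//   imm8 = (1 << 2) + (1 << 5) + (1 << 6) + (1 << 7) = 0x04 + 0x20 + 0x40 + 0x80 = 0xE4
// i.e. the table is simply "C ? A : B" written out as a ternary-logic constant.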
6432
// ---------------------------------------
6433

6434
#ifdef _LP64
6435
instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
6436
  match(Set dst (CopySignF dst src));
6437
  effect(TEMP tmp1, TEMP tmp2);
6438
  format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6439
  ins_encode %{
6440
    __ movl($tmp2$$Register, 0x7FFFFFFF);
6441
    __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
6442
    __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6443
  %}
6444
  ins_pipe( pipe_slow );
6445
%}
6446

6447
instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
6448
  match(Set dst (CopySignD dst (Binary src zero)));
6449
  ins_cost(100);
6450
  effect(TEMP tmp1, TEMP tmp2);
6451
  format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6452
  ins_encode %{
6453
    __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
6454
    __ movq($tmp1$$XMMRegister, $tmp2$$Register);
6455
    __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6456
  %}
6457
  ins_pipe( pipe_slow );
6458
%}
6459

6460
#endif // _LP64
6461

6462
//----------------------------- CompressBits/ExpandBits ------------------------
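//
// CompressBits maps to the BMI2 PEXT instruction and ExpandBits to BMI2 PDEP.
// Roughly (scalar sketch on 8-bit values, for illustration only):
//   pext(src, mask): gather the src bits at positions where mask is 1 into the low bits,
//                    e.g. pext(0b10110100, 0b11110000) == 0b00001011
//   pdep(src, mask): scatter the low bits of src to the positions where mask is 1,
//                    e.g. pdep(0b00001011, 0b11110000) == 0b10110000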
6463

6464
instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
6465
  predicate(n->bottom_type()->isa_int());
6466
  match(Set dst (CompressBits src mask));
6467
  format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
6468
  ins_encode %{
6469
    __ pextl($dst$$Register, $src$$Register, $mask$$Register);
6470
  %}
6471
  ins_pipe( pipe_slow );
6472
%}
6473

6474
instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
6475
  predicate(n->bottom_type()->isa_int());
6476
  match(Set dst (ExpandBits src mask));
6477
  format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
6478
  ins_encode %{
6479
    __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
6480
  %}
6481
  ins_pipe( pipe_slow );
6482
%}
6483

6484
instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
6485
  predicate(n->bottom_type()->isa_int());
6486
  match(Set dst (CompressBits src (LoadI mask)));
6487
  format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
6488
  ins_encode %{
6489
    __ pextl($dst$$Register, $src$$Register, $mask$$Address);
6490
  %}
6491
  ins_pipe( pipe_slow );
6492
%}
6493

6494
instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
6495
  predicate(n->bottom_type()->isa_int());
6496
  match(Set dst (ExpandBits src (LoadI mask)));
6497
  format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
6498
  ins_encode %{
6499
    __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
6500
  %}
6501
  ins_pipe( pipe_slow );
6502
%}
6503

6504
// --------------------------------- Sqrt --------------------------------------
6505

6506
instruct vsqrtF_reg(vec dst, vec src) %{
6507
  match(Set dst (SqrtVF src));
6508
  format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
6509
  ins_encode %{
6510
    assert(UseAVX > 0, "required");
6511
    int vlen_enc = vector_length_encoding(this);
6512
    __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6513
  %}
6514
  ins_pipe( pipe_slow );
6515
%}
6516

6517
instruct vsqrtF_mem(vec dst, memory mem) %{
6518
  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6519
  match(Set dst (SqrtVF (LoadVector mem)));
6520
  format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
6521
  ins_encode %{
6522
    assert(UseAVX > 0, "required");
6523
    int vlen_enc = vector_length_encoding(this);
6524
    __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
6525
  %}
6526
  ins_pipe( pipe_slow );
6527
%}
6528

6529
// Floating point vector sqrt
6530
instruct vsqrtD_reg(vec dst, vec src) %{
6531
  match(Set dst (SqrtVD src));
6532
  format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
6533
  ins_encode %{
6534
    assert(UseAVX > 0, "required");
6535
    int vlen_enc = vector_length_encoding(this);
6536
    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6537
  %}
6538
  ins_pipe( pipe_slow );
6539
%}
6540

6541
instruct vsqrtD_mem(vec dst, memory mem) %{
6542
  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6543
  match(Set dst (SqrtVD (LoadVector mem)));
6544
  format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
6545
  ins_encode %{
6546
    assert(UseAVX > 0, "required");
6547
    int vlen_enc = vector_length_encoding(this);
6548
    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
6549
  %}
6550
  ins_pipe( pipe_slow );
6551
%}
6552

6553
// ------------------------------ Shift ---------------------------------------
6554

6555
// Left and right shift count vectors are the same on x86
6556
// (only lowest bits of xmm reg are used for count).
6557
instruct vshiftcnt(vec dst, rRegI cnt) %{
6558
  match(Set dst (LShiftCntV cnt));
6559
  match(Set dst (RShiftCntV cnt));
6560
  format %{ "movdl    $dst,$cnt\t! load shift count" %}
6561
  ins_encode %{
6562
    __ movdl($dst$$XMMRegister, $cnt$$Register);
6563
  %}
6564
  ins_pipe( pipe_slow );
6565
%}
6566

6567
// Byte vector shift
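// x86 has no packed byte shift instruction, so byte shifts are emulated by widening each
// byte to a word, shifting the words, masking the results back to byte range and
// re-packing. Per lane the idea is roughly (illustrative only, not the generated code):
//   int16_t w = sign ? (int16_t)(int8_t)b : (int16_t)(uint8_t)b;  // vextendbw
//   w = shift_op(w, count);                                       // vshiftw
//   uint8_t r = (uint8_t)(w & 0xFF);                              // pand + packuswb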
6568
instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
6569
  predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
6570
  match(Set dst ( LShiftVB src shift));
6571
  match(Set dst ( RShiftVB src shift));
6572
  match(Set dst (URShiftVB src shift));
6573
  effect(TEMP dst, USE src, USE shift, TEMP tmp);
6574
  format %{"vector_byte_shift $dst,$src,$shift" %}
6575
  ins_encode %{
6576
    assert(UseSSE > 3, "required");
6577
    int opcode = this->ideal_Opcode();
6578
    bool sign = (opcode != Op_URShiftVB);
6579
    __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
6580
    __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
6581
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
6582
    __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
6583
    __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
6584
  %}
6585
  ins_pipe( pipe_slow );
6586
%}
6587

6588
instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
6589
  predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6590
            UseAVX <= 1);
6591
  match(Set dst ( LShiftVB src shift));
6592
  match(Set dst ( RShiftVB src shift));
6593
  match(Set dst (URShiftVB src shift));
6594
  effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
6595
  format %{"vector_byte_shift $dst,$src,$shift" %}
6596
  ins_encode %{
6597
    assert(UseSSE > 3, "required");
6598
    int opcode = this->ideal_Opcode();
6599
    bool sign = (opcode != Op_URShiftVB);
6600
    __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
6601
    __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
6602
    __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
6603
    __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
6604
    __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
6605
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
6606
    __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
6607
    __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
6608
    __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
6609
  %}
6610
  ins_pipe( pipe_slow );
6611
%}
6612

6613
instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
6614
  predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6615
            UseAVX > 1);
6616
  match(Set dst ( LShiftVB src shift));
6617
  match(Set dst ( RShiftVB src shift));
6618
  match(Set dst (URShiftVB src shift));
6619
  effect(TEMP dst, TEMP tmp);
6620
  format %{"vector_byte_shift $dst,$src,$shift" %}
6621
  ins_encode %{
6622
    int opcode = this->ideal_Opcode();
6623
    bool sign = (opcode != Op_URShiftVB);
6624
    int vlen_enc = Assembler::AVX_256bit;
6625
    __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
6626
    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6627
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
6628
    __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
6629
    __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
6630
  %}
6631
  ins_pipe( pipe_slow );
6632
%}
6633

6634
instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
6635
  predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
6636
  match(Set dst ( LShiftVB src shift));
6637
  match(Set dst ( RShiftVB src shift));
6638
  match(Set dst (URShiftVB src shift));
6639
  effect(TEMP dst, TEMP tmp);
6640
  format %{"vector_byte_shift $dst,$src,$shift" %}
6641
  ins_encode %{
6642
    assert(UseAVX > 1, "required");
6643
    int opcode = this->ideal_Opcode();
6644
    bool sign = (opcode != Op_URShiftVB);
6645
    int vlen_enc = Assembler::AVX_256bit;
6646
    __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6647
    __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6648
    __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6649
    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6650
    __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6651
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
6652
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
6653
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6654
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6655
  %}
6656
  ins_pipe( pipe_slow );
6657
%}
6658

6659
instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
6660
  predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
6661
  match(Set dst ( LShiftVB src shift));
6662
  match(Set dst  (RShiftVB src shift));
6663
  match(Set dst (URShiftVB src shift));
6664
  effect(TEMP dst, TEMP tmp1, TEMP tmp2);
6665
  format %{"vector_byte_shift $dst,$src,$shift" %}
6666
  ins_encode %{
6667
    assert(UseAVX > 2, "required");
6668
    int opcode = this->ideal_Opcode();
6669
    bool sign = (opcode != Op_URShiftVB);
6670
    int vlen_enc = Assembler::AVX_512bit;
6671
    __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6672
    __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6673
    __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6674
    __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6675
    __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6676
    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
6677
    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6678
    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6679
    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6680
    __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6681
    __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
6682
    __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6683
  %}
6684
  ins_pipe( pipe_slow );
6685
%}
6686

6687
// Shorts vector logical right shift produces incorrect Java result
6688
// for negative data because Java code converts the short value into an int with
6689
// sign extension before a shift. But char vectors are fine since chars are
6690
// unsigned values.
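// For example, the Java expression (short)-1 >>> 2 first widens -1 to the int 0xFFFFFFFF
// and yields 0x3FFFFFFF, whereas a 16-bit logical shift of 0xFFFF would give only 0x3FFF,
// so the two disagree once the result is truncated back to a short.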
6691
// Shorts/Chars vector shift
6692
instruct vshiftS(vec dst, vec src, vec shift) %{
6693
  predicate(!n->as_ShiftV()->is_var_shift());
6694
  match(Set dst ( LShiftVS src shift));
6695
  match(Set dst ( RShiftVS src shift));
6696
  match(Set dst (URShiftVS src shift));
6697
  effect(TEMP dst, USE src, USE shift);
6698
  format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6699
  ins_encode %{
6700
    int opcode = this->ideal_Opcode();
6701
    if (UseAVX > 0) {
6702
      int vlen_enc = vector_length_encoding(this);
6703
      __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6704
    } else {
6705
      int vlen = Matcher::vector_length(this);
6706
      if (vlen == 2) {
6707
        __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6708
        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6709
      } else if (vlen == 4) {
6710
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6711
        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6712
      } else {
6713
        assert (vlen == 8, "sanity");
6714
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6715
        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6716
      }
6717
    }
6718
  %}
6719
  ins_pipe( pipe_slow );
6720
%}
6721

6722
// Integers vector shift
6723
instruct vshiftI(vec dst, vec src, vec shift) %{
6724
  predicate(!n->as_ShiftV()->is_var_shift());
6725
  match(Set dst ( LShiftVI src shift));
6726
  match(Set dst ( RShiftVI src shift));
6727
  match(Set dst (URShiftVI src shift));
6728
  effect(TEMP dst, USE src, USE shift);
6729
  format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6730
  ins_encode %{
6731
    int opcode = this->ideal_Opcode();
6732
    if (UseAVX > 0) {
6733
      int vlen_enc = vector_length_encoding(this);
6734
      __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6735
    } else {
6736
      int vlen = Matcher::vector_length(this);
6737
      if (vlen == 2) {
6738
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6739
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6740
      } else {
6741
        assert(vlen == 4, "sanity");
6742
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6743
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6744
      }
6745
    }
6746
  %}
6747
  ins_pipe( pipe_slow );
6748
%}
6749

6750
// Integers vector constant shift
6751
instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6752
  match(Set dst (LShiftVI src (LShiftCntV shift)));
6753
  match(Set dst (RShiftVI src (RShiftCntV shift)));
6754
  match(Set dst (URShiftVI src (RShiftCntV shift)));
6755
  format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6756
  ins_encode %{
6757
    int opcode = this->ideal_Opcode();
6758
    if (UseAVX > 0) {
6759
      int vector_len = vector_length_encoding(this);
6760
      __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6761
    } else {
6762
      int vlen = Matcher::vector_length(this);
6763
      if (vlen == 2) {
6764
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6765
        __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6766
      } else {
6767
        assert(vlen == 4, "sanity");
6768
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6769
        __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6770
      }
6771
    }
6772
  %}
6773
  ins_pipe( pipe_slow );
6774
%}
6775

6776
// Longs vector shift
6777
instruct vshiftL(vec dst, vec src, vec shift) %{
6778
  predicate(!n->as_ShiftV()->is_var_shift());
6779
  match(Set dst ( LShiftVL src shift));
6780
  match(Set dst (URShiftVL src shift));
6781
  effect(TEMP dst, USE src, USE shift);
6782
  format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6783
  ins_encode %{
6784
    int opcode = this->ideal_Opcode();
6785
    if (UseAVX > 0) {
6786
      int vlen_enc = vector_length_encoding(this);
6787
      __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6788
    } else {
6789
      assert(Matcher::vector_length(this) == 2, "");
6790
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6791
      __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6792
    }
6793
  %}
6794
  ins_pipe( pipe_slow );
6795
%}
6796

6797
// Longs vector constant shift
6798
instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6799
  match(Set dst (LShiftVL src (LShiftCntV shift)));
6800
  match(Set dst (URShiftVL src (RShiftCntV shift)));
6801
  format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6802
  ins_encode %{
6803
    int opcode = this->ideal_Opcode();
6804
    if (UseAVX > 0) {
6805
      int vector_len = vector_length_encoding(this);
6806
      __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6807
    } else {
6808
      assert(Matcher::vector_length(this) == 2, "");
6809
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6810
      __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6811
    }
6812
  %}
6813
  ins_pipe( pipe_slow );
6814
%}
6815

6816
// -------------------ArithmeticRightShift -----------------------------------
6817
// Long vector arithmetic right shift
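// SSE/AVX2 provide no packed 64-bit arithmetic right shift (vpsraq is AVX-512 only), so on
// older hardware it is emulated from the logical shift using the identity (per lane, for a
// shift count n):
//   sra(x, n) == (srl(x, n) ^ t) - t,  where t == srl(0x8000000000000000, n)
// The xor/subtract pair re-extends the shifted-in copy of the sign bit across the upper n
// bits, which is what the rule below does with vector_long_sign_mask().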
6818
instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
6819
  predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
6820
  match(Set dst (RShiftVL src shift));
6821
  effect(TEMP dst, TEMP tmp);
6822
  format %{ "vshiftq $dst,$src,$shift" %}
6823
  ins_encode %{
6824
    uint vlen = Matcher::vector_length(this);
6825
    if (vlen == 2) {
6826
      assert(UseSSE >= 2, "required");
6827
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6828
      __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6829
      __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
6830
      __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6831
      __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6832
      __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6833
    } else {
6834
      assert(vlen == 4, "sanity");
6835
      assert(UseAVX > 1, "required");
6836
      int vlen_enc = Assembler::AVX_256bit;
6837
      __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6838
      __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
6839
      __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6840
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6841
      __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6842
    }
6843
  %}
6844
  ins_pipe( pipe_slow );
6845
%}
6846

6847
instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6848
  predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
6849
  match(Set dst (RShiftVL src shift));
6850
  format %{ "vshiftq $dst,$src,$shift" %}
6851
  ins_encode %{
6852
    int vlen_enc = vector_length_encoding(this);
6853
    __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6854
  %}
6855
  ins_pipe( pipe_slow );
6856
%}
6857

6858
// ------------------- Variable Shift -----------------------------
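//
// A "variable" shift takes a per-lane count vector instead of a single scalar count, so it
// cannot use the single-count shift forms above. Int lanes map directly to the AVX2
// vpsllvd/vpsrlvd/vpsravd instructions and long lanes to vpsllvq/vpsrlvq (the arithmetic
// long form needs AVX-512 vpsravq or emulation); byte and short lanes have no variable
// shift instruction at all and are emulated by widening, shifting and re-packing.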
6859
// Byte variable shift
6860
instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
6861
  predicate(Matcher::vector_length(n) <= 8 &&
6862
            n->as_ShiftV()->is_var_shift() &&
6863
            !VM_Version::supports_avx512bw());
6864
  match(Set dst ( LShiftVB src shift));
6865
  match(Set dst ( RShiftVB src shift));
6866
  match(Set dst (URShiftVB src shift));
6867
  effect(TEMP dst, TEMP vtmp);
6868
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
6869
  ins_encode %{
6870
    assert(UseAVX >= 2, "required");
6871

6872
    int opcode = this->ideal_Opcode();
6873
    int vlen_enc = Assembler::AVX_128bit;
6874
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
6875
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6876
  %}
6877
  ins_pipe( pipe_slow );
6878
%}
6879

6880
instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
6881
  predicate(Matcher::vector_length(n) == 16 &&
6882
            n->as_ShiftV()->is_var_shift() &&
6883
            !VM_Version::supports_avx512bw());
6884
  match(Set dst ( LShiftVB src shift));
6885
  match(Set dst ( RShiftVB src shift));
6886
  match(Set dst (URShiftVB src shift));
6887
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
6888
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
6889
  ins_encode %{
6890
    assert(UseAVX >= 2, "required");
6891

6892
    int opcode = this->ideal_Opcode();
6893
    int vlen_enc = Assembler::AVX_128bit;
6894
    // Shift lower half and get word result in dst
6895
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
6896

6897
    // Shift upper half and get word result in vtmp1
6898
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6899
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6900
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
6901

6902
    // Merge and down convert the two word results to byte in dst
6903
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6904
  %}
6905
  ins_pipe( pipe_slow );
6906
%}
6907

6908
instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
6909
  predicate(Matcher::vector_length(n) == 32 &&
6910
            n->as_ShiftV()->is_var_shift() &&
6911
            !VM_Version::supports_avx512bw());
6912
  match(Set dst ( LShiftVB src shift));
6913
  match(Set dst ( RShiftVB src shift));
6914
  match(Set dst (URShiftVB src shift));
6915
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
6916
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
6917
  ins_encode %{
6918
    assert(UseAVX >= 2, "required");
6919

6920
    int opcode = this->ideal_Opcode();
6921
    int vlen_enc = Assembler::AVX_128bit;
6922
    // Process lower 128 bits and get result in dst
6923
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
6924
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6925
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6926
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
6927
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6928

6929
    // Process higher 128 bits and get result in vtmp3
6930
    __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6931
    __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6932
    __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
6933
    __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6934
    __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6935
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
6936
    __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6937

6938
    // Merge the two results in dst
6939
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6940
  %}
6941
  ins_pipe( pipe_slow );
6942
%}
6943

6944
instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
6945
  predicate(Matcher::vector_length(n) <= 32 &&
6946
            n->as_ShiftV()->is_var_shift() &&
6947
            VM_Version::supports_avx512bw());
6948
  match(Set dst ( LShiftVB src shift));
6949
  match(Set dst ( RShiftVB src shift));
6950
  match(Set dst (URShiftVB src shift));
6951
  effect(TEMP dst, TEMP vtmp);
6952
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
6953
  ins_encode %{
6954
    assert(UseAVX > 2, "required");
6955

6956
    int opcode = this->ideal_Opcode();
6957
    int vlen_enc = vector_length_encoding(this);
6958
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
6959
  %}
6960
  ins_pipe( pipe_slow );
6961
%}
6962

6963
instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
6964
  predicate(Matcher::vector_length(n) == 64 &&
6965
            n->as_ShiftV()->is_var_shift() &&
6966
            VM_Version::supports_avx512bw());
6967
  match(Set dst ( LShiftVB src shift));
6968
  match(Set dst ( RShiftVB src shift));
6969
  match(Set dst (URShiftVB src shift));
6970
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
6971
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
6972
  ins_encode %{
6973
    assert(UseAVX > 2, "required");
6974

6975
    int opcode = this->ideal_Opcode();
6976
    int vlen_enc = Assembler::AVX_256bit;
6977
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
6978
    __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6979
    __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6980
    __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
6981
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6982
  %}
6983
  ins_pipe( pipe_slow );
6984
%}
6985

6986
// Short variable shift
6987
instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
6988
  predicate(Matcher::vector_length(n) <= 8 &&
6989
            n->as_ShiftV()->is_var_shift() &&
6990
            !VM_Version::supports_avx512bw());
6991
  match(Set dst ( LShiftVS src shift));
6992
  match(Set dst ( RShiftVS src shift));
6993
  match(Set dst (URShiftVS src shift));
6994
  effect(TEMP dst, TEMP vtmp);
6995
  format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
6996
  ins_encode %{
6997
    assert(UseAVX >= 2, "required");
6998

6999
    int opcode = this->ideal_Opcode();
7000
    bool sign = (opcode != Op_URShiftVS);
7001
    int vlen_enc = Assembler::AVX_256bit;
7002
    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
7003
    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
7004
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7005
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
7006
    __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
7007
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
7008
  %}
7009
  ins_pipe( pipe_slow );
7010
%}
7011

7012
instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
7013
  predicate(Matcher::vector_length(n) == 16 &&
7014
            n->as_ShiftV()->is_var_shift() &&
7015
            !VM_Version::supports_avx512bw());
7016
  match(Set dst ( LShiftVS src shift));
7017
  match(Set dst ( RShiftVS src shift));
7018
  match(Set dst (URShiftVS src shift));
7019
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
7020
  format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
7021
  ins_encode %{
7022
    assert(UseAVX >= 2, "required");
7023

7024
    int opcode = this->ideal_Opcode();
7025
    bool sign = (opcode != Op_URShiftVS);
7026
    int vlen_enc = Assembler::AVX_256bit;
7027
    // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
7028
    __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
7029
    __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
7030
    __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
7031
    __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
7032

7033
    // Shift upper half, with result in dst using vtmp1 as TEMP
7034
    __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
7035
    __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
7036
    __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7037
    __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
7038
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
7039
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
7040

7041
    // Merge lower and upper half result into dst
7042
    __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7043
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
7044
  %}
7045
  ins_pipe( pipe_slow );
7046
%}
7047

7048
instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
7049
  predicate(n->as_ShiftV()->is_var_shift() &&
7050
            VM_Version::supports_avx512bw());
7051
  match(Set dst ( LShiftVS src shift));
7052
  match(Set dst ( RShiftVS src shift));
7053
  match(Set dst (URShiftVS src shift));
7054
  format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
7055
  ins_encode %{
7056
    assert(UseAVX > 2, "required");
7057

7058
    int opcode = this->ideal_Opcode();
7059
    int vlen_enc = vector_length_encoding(this);
7060
    if (!VM_Version::supports_avx512vl()) {
7061
      vlen_enc = Assembler::AVX_512bit;
7062
    }
7063
    __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
7064
  %}
7065
  ins_pipe( pipe_slow );
7066
%}
7067

7068
// Integer variable shift
7069
instruct vshiftI_var(vec dst, vec src, vec shift) %{
7070
  predicate(n->as_ShiftV()->is_var_shift());
7071
  match(Set dst ( LShiftVI src shift));
7072
  match(Set dst ( RShiftVI src shift));
7073
  match(Set dst (URShiftVI src shift));
7074
  format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
7075
  ins_encode %{
7076
    assert(UseAVX >= 2, "required");
7077

7078
    int opcode = this->ideal_Opcode();
7079
    int vlen_enc = vector_length_encoding(this);
7080
    __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
7081
  %}
7082
  ins_pipe( pipe_slow );
7083
%}
7084

7085
// Long variable shift
7086
instruct vshiftL_var(vec dst, vec src, vec shift) %{
7087
  predicate(n->as_ShiftV()->is_var_shift());
7088
  match(Set dst ( LShiftVL src shift));
7089
  match(Set dst (URShiftVL src shift));
7090
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
7091
  ins_encode %{
7092
    assert(UseAVX >= 2, "required");
7093

7094
    int opcode = this->ideal_Opcode();
7095
    int vlen_enc = vector_length_encoding(this);
7096
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
7097
  %}
7098
  ins_pipe( pipe_slow );
7099
%}
7100

7101
// Long variable arithmetic right shift
7102
instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
7103
  predicate(Matcher::vector_length(n) <= 4 &&
7104
            n->as_ShiftV()->is_var_shift() &&
7105
            UseAVX == 2);
7106
  match(Set dst (RShiftVL src shift));
7107
  effect(TEMP dst, TEMP vtmp);
7108
  format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
7109
  ins_encode %{
7110
    int opcode = this->ideal_Opcode();
7111
    int vlen_enc = vector_length_encoding(this);
7112
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
7113
                 $vtmp$$XMMRegister);
7114
  %}
7115
  ins_pipe( pipe_slow );
7116
%}
7117

7118
instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
7119
  predicate(n->as_ShiftV()->is_var_shift() &&
7120
            UseAVX > 2);
7121
  match(Set dst (RShiftVL src shift));
7122
  format %{ "vector_varfshift_long $dst,$src,$shift\t!" %}
7123
  ins_encode %{
7124
    int opcode = this->ideal_Opcode();
7125
    int vlen_enc = vector_length_encoding(this);
7126
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
7127
  %}
7128
  ins_pipe( pipe_slow );
7129
%}
7130

7131
// --------------------------------- AND --------------------------------------
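//
// Bitwise AND/OR/XOR are element-size agnostic, so a single rule per operation covers all
// lane types; only the overall vector length matters, to pick the 128/256/512-bit encoding.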
7132

7133
instruct vand(vec dst, vec src) %{
7134
  predicate(UseAVX == 0);
7135
  match(Set dst (AndV dst src));
7136
  format %{ "pand    $dst,$src\t! and vectors" %}
7137
  ins_encode %{
7138
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
7139
  %}
7140
  ins_pipe( pipe_slow );
7141
%}
7142

7143
instruct vand_reg(vec dst, vec src1, vec src2) %{
7144
  predicate(UseAVX > 0);
7145
  match(Set dst (AndV src1 src2));
7146
  format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
7147
  ins_encode %{
7148
    int vlen_enc = vector_length_encoding(this);
7149
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7150
  %}
7151
  ins_pipe( pipe_slow );
7152
%}
7153

7154
instruct vand_mem(vec dst, vec src, memory mem) %{
7155
  predicate((UseAVX > 0) &&
7156
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
7157
  match(Set dst (AndV src (LoadVector mem)));
7158
  format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
7159
  ins_encode %{
7160
    int vlen_enc = vector_length_encoding(this);
7161
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
7162
  %}
7163
  ins_pipe( pipe_slow );
7164
%}
7165

7166
// --------------------------------- OR ---------------------------------------
7167

7168
instruct vor(vec dst, vec src) %{
7169
  predicate(UseAVX == 0);
7170
  match(Set dst (OrV dst src));
7171
  format %{ "por     $dst,$src\t! or vectors" %}
7172
  ins_encode %{
7173
    __ por($dst$$XMMRegister, $src$$XMMRegister);
7174
  %}
7175
  ins_pipe( pipe_slow );
7176
%}
7177

7178
instruct vor_reg(vec dst, vec src1, vec src2) %{
7179
  predicate(UseAVX > 0);
7180
  match(Set dst (OrV src1 src2));
7181
  format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
7182
  ins_encode %{
7183
    int vlen_enc = vector_length_encoding(this);
7184
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7185
  %}
7186
  ins_pipe( pipe_slow );
7187
%}
7188

7189
instruct vor_mem(vec dst, vec src, memory mem) %{
7190
  predicate((UseAVX > 0) &&
7191
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
7192
  match(Set dst (OrV src (LoadVector mem)));
7193
  format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
7194
  ins_encode %{
7195
    int vlen_enc = vector_length_encoding(this);
7196
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
7197
  %}
7198
  ins_pipe( pipe_slow );
7199
%}
7200

7201
// --------------------------------- XOR --------------------------------------
7202

7203
instruct vxor(vec dst, vec src) %{
7204
  predicate(UseAVX == 0);
7205
  match(Set dst (XorV dst src));
7206
  format %{ "pxor    $dst,$src\t! xor vectors" %}
7207
  ins_encode %{
7208
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
7209
  %}
7210
  ins_pipe( pipe_slow );
7211
%}
7212

7213
instruct vxor_reg(vec dst, vec src1, vec src2) %{
7214
  predicate(UseAVX > 0);
7215
  match(Set dst (XorV src1 src2));
7216
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
7217
  ins_encode %{
7218
    int vlen_enc = vector_length_encoding(this);
7219
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7220
  %}
7221
  ins_pipe( pipe_slow );
7222
%}
7223

7224
instruct vxor_mem(vec dst, vec src, memory mem) %{
7225
  predicate((UseAVX > 0) &&
7226
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
7227
  match(Set dst (XorV src (LoadVector mem)));
7228
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
7229
  ins_encode %{
7230
    int vlen_enc = vector_length_encoding(this);
7231
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
7232
  %}
7233
  ins_pipe( pipe_slow );
7234
%}
7235

7236
// --------------------------------- VectorCast --------------------------------------
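//
// VectorCastB2X/S2X/I2X/L2X/F2X/D2X cast lanes of the named source element type to the
// node's own element type. Widening integral casts use the sign-extending pmovsx forms,
// narrowing casts use the EVEX down-converts (evpmovwb, evpmovdb, ...) or mask-and-pack
// sequences on pre-AVX512VL hardware, and casts to or from float/double go through the
// cvt* conversions. Float/double to integral casts use the vector_cast*_avx/_evex helpers
// because Java defines NaN -> 0 and saturating overflow, which a raw cvttps2dq does not.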
7237

7238
instruct vcastBtoX(vec dst, vec src) %{
7239
  predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
7240
  match(Set dst (VectorCastB2X src));
7241
  format %{ "vector_cast_b2x $dst,$src\t!" %}
7242
  ins_encode %{
7243
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7244
    int vlen_enc = vector_length_encoding(this);
7245
    __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7246
  %}
7247
  ins_pipe( pipe_slow );
7248
%}
7249

7250
instruct vcastBtoD(legVec dst, legVec src) %{
7251
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
7252
  match(Set dst (VectorCastB2X src));
7253
  format %{ "vector_cast_b2x $dst,$src\t!" %}
7254
  ins_encode %{
7255
    int vlen_enc = vector_length_encoding(this);
7256
    __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7257
  %}
7258
  ins_pipe( pipe_slow );
7259
%}
7260

7261
instruct castStoX(vec dst, vec src) %{
7262
  predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
7263
            Matcher::vector_length(n->in(1)) <= 8 && // src
7264
            Matcher::vector_element_basic_type(n) == T_BYTE);
7265
  match(Set dst (VectorCastS2X src));
7266
  format %{ "vector_cast_s2x $dst,$src" %}
7267
  ins_encode %{
7268
    assert(UseAVX > 0, "required");
7269

7270
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
7271
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
7272
  %}
7273
  ins_pipe( pipe_slow );
7274
%}
7275

7276
instruct vcastStoX(vec dst, vec src, vec vtmp) %{
7277
  predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
7278
            Matcher::vector_length(n->in(1)) == 16 && // src
7279
            Matcher::vector_element_basic_type(n) == T_BYTE);
7280
  effect(TEMP dst, TEMP vtmp);
7281
  match(Set dst (VectorCastS2X src));
7282
  format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
7283
  ins_encode %{
7284
    assert(UseAVX > 0, "required");
7285

7286
    int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
7287
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
7288
    __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
7289
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
7290
  %}
7291
  ins_pipe( pipe_slow );
7292
%}
7293

7294
instruct vcastStoX_evex(vec dst, vec src) %{
7295
  predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
7296
            (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
7297
  match(Set dst (VectorCastS2X src));
7298
  format %{ "vector_cast_s2x $dst,$src\t!" %}
7299
  ins_encode %{
7300
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7301
    int src_vlen_enc = vector_length_encoding(this, $src);
7302
    int vlen_enc = vector_length_encoding(this);
7303
    switch (to_elem_bt) {
7304
      case T_BYTE:
7305
        if (!VM_Version::supports_avx512vl()) {
7306
          vlen_enc = Assembler::AVX_512bit;
7307
        }
7308
        __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7309
        break;
7310
      case T_INT:
7311
        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7312
        break;
7313
      case T_FLOAT:
7314
        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7315
        __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7316
        break;
7317
      case T_LONG:
7318
        __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7319
        break;
7320
      case T_DOUBLE: {
7321
        int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
7322
        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
7323
        __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7324
        break;
7325
      }
7326
      default:
7327
        ShouldNotReachHere();
7328
    }
7329
  %}
7330
  ins_pipe( pipe_slow );
7331
%}
7332

7333
instruct castItoX(vec dst, vec src) %{
7334
  predicate(UseAVX <= 2 &&
7335
            (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
7336
            (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
7337
  match(Set dst (VectorCastI2X src));
7338
  format %{ "vector_cast_i2x $dst,$src" %}
7339
  ins_encode %{
7340
    assert(UseAVX > 0, "required");
7341

7342
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7343
    int vlen_enc = vector_length_encoding(this, $src);
7344

7345
    if (to_elem_bt == T_BYTE) {
7346
      __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
7347
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7348
      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7349
    } else {
7350
      assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7351
      __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
7352
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7353
    }
7354
  %}
7355
  ins_pipe( pipe_slow );
7356
%}
7357

7358
instruct vcastItoX(vec dst, vec src, vec vtmp) %{
7359
  predicate(UseAVX <= 2 &&
7360
            (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
7361
            (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
7362
  match(Set dst (VectorCastI2X src));
7363
  format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
7364
  effect(TEMP dst, TEMP vtmp);
7365
  ins_encode %{
7366
    assert(UseAVX > 0, "required");
7367

7368
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7369
    int vlen_enc = vector_length_encoding(this, $src);
7370

7371
    if (to_elem_bt == T_BYTE) {
7372
      __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
7373
      __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7374
      __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7375
      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7376
    } else {
7377
      assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7378
      __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
7379
      __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7380
      __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7381
    }
7382
  %}
7383
  ins_pipe( pipe_slow );
7384
%}
7385

7386
instruct vcastItoX_evex(vec dst, vec src) %{
7387
  predicate(UseAVX > 2 ||
7388
            (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
7389
  match(Set dst (VectorCastI2X src));
7390
  format %{ "vector_cast_i2x $dst,$src\t!" %}
7391
  ins_encode %{
7392
    assert(UseAVX > 0, "required");
7393

7394
    BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
7395
    int src_vlen_enc = vector_length_encoding(this, $src);
7396
    int dst_vlen_enc = vector_length_encoding(this);
7397
    switch (dst_elem_bt) {
7398
      case T_BYTE:
7399
        if (!VM_Version::supports_avx512vl()) {
7400
          src_vlen_enc = Assembler::AVX_512bit;
7401
        }
7402
        __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7403
        break;
7404
      case T_SHORT:
7405
        if (!VM_Version::supports_avx512vl()) {
7406
          src_vlen_enc = Assembler::AVX_512bit;
7407
        }
7408
        __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7409
        break;
7410
      case T_FLOAT:
7411
        __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7412
        break;
7413
      case T_LONG:
7414
        __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7415
        break;
7416
      case T_DOUBLE:
7417
        __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7418
        break;
7419
      default:
7420
        ShouldNotReachHere();
7421
    }
7422
  %}
7423
  ins_pipe( pipe_slow );
7424
%}
7425

7426
instruct vcastLtoBS(vec dst, vec src) %{
7427
  predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
7428
            UseAVX <= 2);
7429
  match(Set dst (VectorCastL2X src));
7430
  format %{ "vector_cast_l2x  $dst,$src" %}
7431
  ins_encode %{
7432
    assert(UseAVX > 0, "required");
7433

7434
    int vlen = Matcher::vector_length_in_bytes(this, $src);
7435
    BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
7436
    AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
7437
                                                      : ExternalAddress(vector_int_to_short_mask());
7438
    if (vlen <= 16) {
7439
      __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
7440
      __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
7441
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7442
    } else {
7443
      assert(vlen <= 32, "required");
7444
      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
7445
      __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
7446
      __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
7447
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7448
    }
7449
    if (to_elem_bt == T_BYTE) {
7450
      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7451
    }
7452
  %}
7453
  ins_pipe( pipe_slow );
7454
%}
7455

7456
instruct vcastLtoX_evex(vec dst, vec src) %{
7457
  predicate(UseAVX > 2 ||
7458
            (Matcher::vector_element_basic_type(n) == T_INT ||
7459
             Matcher::vector_element_basic_type(n) == T_FLOAT ||
7460
             Matcher::vector_element_basic_type(n) == T_DOUBLE));
7461
  match(Set dst (VectorCastL2X src));
7462
  format %{ "vector_cast_l2x  $dst,$src\t!" %}
7463
  ins_encode %{
7464
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7465
    int vlen = Matcher::vector_length_in_bytes(this, $src);
7466
    int vlen_enc = vector_length_encoding(this, $src);
7467
    switch (to_elem_bt) {
7468
      case T_BYTE:
7469
        if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7470
          vlen_enc = Assembler::AVX_512bit;
7471
        }
7472
        __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7473
        break;
7474
      case T_SHORT:
7475
        if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7476
          vlen_enc = Assembler::AVX_512bit;
7477
        }
7478
        __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7479
        break;
7480
      case T_INT:
7481
        if (vlen == 8) {
7482
          if ($dst$$XMMRegister != $src$$XMMRegister) {
7483
            __ movflt($dst$$XMMRegister, $src$$XMMRegister);
7484
          }
7485
        } else if (vlen == 16) {
7486
          __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
7487
        } else if (vlen == 32) {
7488
          if (UseAVX > 2) {
7489
            if (!VM_Version::supports_avx512vl()) {
7490
              vlen_enc = Assembler::AVX_512bit;
7491
            }
7492
            __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7493
          } else {
7494
            __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
7495
            __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
7496
          }
7497
        } else { // vlen == 64
7498
          __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7499
        }
7500
        break;
7501
      case T_FLOAT:
7502
        assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7503
        __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7504
        break;
7505
      case T_DOUBLE:
7506
        assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7507
        __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7508
        break;
7509

7510
      default: assert(false, "%s", type2name(to_elem_bt));
7511
    }
7512
  %}
7513
  ins_pipe( pipe_slow );
7514
%}
7515

7516
instruct vcastFtoD_reg(vec dst, vec src) %{
7517
  predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
7518
  match(Set dst (VectorCastF2X src));
7519
  format %{ "vector_cast_f2d  $dst,$src\t!" %}
7520
  ins_encode %{
7521
    int vlen_enc = vector_length_encoding(this);
7522
    __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7523
  %}
7524
  ins_pipe( pipe_slow );
7525
%}
7526

7527

7528
instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
7529
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
7530
            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
7531
  match(Set dst (VectorCastF2X src));
7532
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
7533
  format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
7534
  ins_encode %{
7535
    int vlen_enc = vector_length_encoding(this, $src);
7536
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7537
    // JDK-8292878 removed the explicit scratch register that was previously needed to load
7538
    // addresses wider than 32 bits for register-indirect addressing, since stub constants
7539
    // live in the code cache and ReservedCodeCacheSize is currently capped at 2G.
7540
    // Targets are free to raise that limit, but a code cache larger than 2G is unrealistic
7541
    // in practice. On the flip side, with the given cap we save a temporary register
7542
    // allocation, which in the limiting case can prevent spilling in blocks with high
7543
    // register pressure.
7544
    __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7545
                          $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
7546
                          ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
7547
  %}
7548
  ins_pipe( pipe_slow );
7549
%}
7550

7551
instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
7552
  predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
7553
            is_integral_type(Matcher::vector_element_basic_type(n)));
7554
  match(Set dst (VectorCastF2X src));
7555
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
7556
  format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
7557
  ins_encode %{
7558
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7559
    if (to_elem_bt == T_LONG) {
7560
      int vlen_enc = vector_length_encoding(this);
7561
      __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7562
                             $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7563
                             ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
7564
    } else {
7565
      int vlen_enc = vector_length_encoding(this, $src);
7566
      __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7567
                             $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7568
                             ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
7569
    }
7570
  %}
7571
  ins_pipe( pipe_slow );
7572
%}
7573

7574
instruct vcastDtoF_reg(vec dst, vec src) %{
7575
  predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
7576
  match(Set dst (VectorCastD2X src));
7577
  format %{ "vector_cast_d2x  $dst,$src\t!" %}
7578
  ins_encode %{
7579
    int vlen_enc = vector_length_encoding(this, $src);
7580
    __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7581
  %}
7582
  ins_pipe( pipe_slow );
7583
%}
7584

7585
instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
7586
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
7587
            is_integral_type(Matcher::vector_element_basic_type(n)));
7588
  match(Set dst (VectorCastD2X src));
7589
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
7590
  format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
7591
  ins_encode %{
7592
    int vlen_enc = vector_length_encoding(this, $src);
7593
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7594
    __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7595
                          $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
7596
                          ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
7597
  %}
7598
  ins_pipe( pipe_slow );
7599
%}
7600

7601
instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
7602
  predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
7603
            is_integral_type(Matcher::vector_element_basic_type(n)));
7604
  match(Set dst (VectorCastD2X src));
7605
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
7606
  format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
7607
  ins_encode %{
7608
    int vlen_enc = vector_length_encoding(this, $src);
7609
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7610
    AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
7611
                              ExternalAddress(vector_float_signflip());
7612
    __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7613
                           $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
7614
  %}
7615
  ins_pipe( pipe_slow );
7616
%}
7617

7618
instruct vucast(vec dst, vec src) %{
7619
  match(Set dst (VectorUCastB2X src));
7620
  match(Set dst (VectorUCastS2X src));
7621
  match(Set dst (VectorUCastI2X src));
7622
  format %{ "vector_ucast $dst,$src\t!" %}
7623
  ins_encode %{
7624
    assert(UseAVX > 0, "required");
7625

7626
    BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
7627
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7628
    int vlen_enc = vector_length_encoding(this);
7629
    __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
7630
  %}
7631
  ins_pipe( pipe_slow );
7632
%}
7633

7634
#ifdef _LP64
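// RoundVF/RoundVD implement Java's Math.round: ties round toward positive infinity, NaN
// maps to 0 and out-of-range values saturate, which the default MXCSR rounding mode does
// not give. Roughly, the stubs bias by 0.5 and convert under a scratch MXCSR: 0x3F80 is
// the default 0x1F80 with RC = 01 (round toward negative infinity), and the 0x3FBF value
// used with EnableX86ECoreOpts additionally pre-sets the six exception status bits. See
// vector_round_float_avx/_evex and vector_round_double_evex for the exact sequences.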
7635
instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
7636
  predicate(!VM_Version::supports_avx512vl() &&
7637
            Matcher::vector_length_in_bytes(n) < 64 &&
7638
            Matcher::vector_element_basic_type(n) == T_INT);
7639
  match(Set dst (RoundVF src));
7640
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
7641
  format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
7642
  ins_encode %{
7643
    int vlen_enc = vector_length_encoding(this);
7644
    InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
7645
    __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
7646
                              ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
7647
                              $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
7648
  %}
7649
  ins_pipe( pipe_slow );
7650
%}

instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
7653
  predicate((VM_Version::supports_avx512vl() ||
7654
             Matcher::vector_length_in_bytes(n) == 64) &&
7655
             Matcher::vector_element_basic_type(n) == T_INT);
7656
  match(Set dst (RoundVF src));
7657
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
7658
  format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
7659
  ins_encode %{
7660
    int vlen_enc = vector_length_encoding(this);
7661
    InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
7662
    __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
7663
                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
7664
                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
7665
  %}
7666
  ins_pipe( pipe_slow );
7667
%}

instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
7670
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
7671
  match(Set dst (RoundVD src));
7672
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
7673
  format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
7674
  ins_encode %{
7675
    int vlen_enc = vector_length_encoding(this);
7676
    InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
7677
    __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
7678
                                ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
7679
                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
7680
  %}
7681
  ins_pipe( pipe_slow );
7682
%}

#endif // _LP64

// --------------------------------- VectorMaskCmp --------------------------------------

instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
7689
  predicate(n->bottom_type()->isa_vectmask() == nullptr &&
7690
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
7691
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7692
            is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7693
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7694
  format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
7695
  ins_encode %{
7696
    int vlen_enc = vector_length_encoding(this, $src1);
7697
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7698
    if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7699
      __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7700
    } else {
7701
      __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7702
    }
7703
  %}
7704
  ins_pipe( pipe_slow );
7705
%}

instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
7708
  predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
7709
            n->bottom_type()->isa_vectmask() == nullptr &&
7710
            is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7711
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7712
  effect(TEMP ktmp);
7713
  format %{ "vector_compare $dst,$src1,$src2,$cond" %}
7714
  ins_encode %{
7715
    int vlen_enc = Assembler::AVX_512bit;
7716
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7717
    KRegister mask = k0; // The comparison itself is not being masked.
7718
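    // Compare into the k-register, then expand it into a vector of all-ones / all-zeroes lanes.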
    if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7719
      __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7720
      __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
7721
    } else {
7722
      __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7723
      __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
7724
    }
7725
  %}
7726
  ins_pipe( pipe_slow );
7727
%}

instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
7730
  predicate(n->bottom_type()->isa_vectmask() &&
7731
            is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7732
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7733
  format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
7734
  ins_encode %{
7735
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7736
    int vlen_enc = vector_length_encoding(this, $src1);
7737
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7738
    KRegister mask = k0; // The comparison itself is not being masked.
7739
    if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7740
      __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7741
    } else {
7742
      __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7743
    }
7744
  %}
7745
  ins_pipe( pipe_slow );
7746
%}

instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
7749
  predicate(n->bottom_type()->isa_vectmask() == nullptr &&
7750
            !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7751
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7752
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7753
            is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
7754
            (n->in(2)->get_int() == BoolTest::eq ||
7755
             n->in(2)->get_int() == BoolTest::lt ||
7756
             n->in(2)->get_int() == BoolTest::gt)); // cond
7757
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7758
  format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
7759
  ins_encode %{
7760
    int vlen_enc = vector_length_encoding(this, $src1);
7761
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7762
    Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7763
    __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
7764
  %}
7765
  ins_pipe( pipe_slow );
7766
%}

instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
7769
  predicate(n->bottom_type()->isa_vectmask() == nullptr &&
7770
            !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7771
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7772
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7773
            is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
7774
            (n->in(2)->get_int() == BoolTest::ne ||
7775
             n->in(2)->get_int() == BoolTest::le ||
7776
             n->in(2)->get_int() == BoolTest::ge)); // cond
7777
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7778
  effect(TEMP dst, TEMP xtmp);
7779
  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
7780
  ins_encode %{
7781
    int vlen_enc = vector_length_encoding(this, $src1);
7782
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7783
    Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7784
    __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
7785
  %}
7786
  ins_pipe( pipe_slow );
7787
%}

instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
7790
  predicate(n->bottom_type()->isa_vectmask() == nullptr &&
7791
            Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7792
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7793
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7794
            is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7795
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7796
  effect(TEMP dst, TEMP xtmp);
7797
  format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
7798
  ins_encode %{
7799
    InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
7800
    int vlen_enc = vector_length_encoding(this, $src1);
7801
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7802
    Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7803

7804
    if (vlen_enc == Assembler::AVX_128bit) {
7805
      __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
7806
    } else {
7807
      __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
7808
    }
7809
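    // Flip the sign bit of both operands so that the signed compare below yields the unsigned result.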
    __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
7810
    __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7811
    __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
7812
  %}
7813
  ins_pipe( pipe_slow );
7814
%}

instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
7817
  predicate((n->bottom_type()->isa_vectmask() == nullptr &&
7818
             Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
7819
             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7820
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7821
  effect(TEMP ktmp);
7822
  format %{ "vector_compare $dst,$src1,$src2,$cond" %}
7823
  ins_encode %{
7824
    assert(UseAVX > 2, "required");
7825

7826
    int vlen_enc = vector_length_encoding(this, $src1);
7827
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7828
    bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
7829
    KRegister mask = k0; // The comparison itself is not being masked.
7830
    bool merge = false;
7831
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7832

7833
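    // Compare into $ktmp, then expand the k-mask into an all-bits-set vector in $dst.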
    switch (src1_elem_bt) {
7834
      case T_INT: {
7835
        __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7836
        __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
7837
        break;
7838
      }
7839
      case T_LONG: {
7840
        __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7841
        __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
7842
        break;
7843
      }
7844
      default: assert(false, "%s", type2name(src1_elem_bt));
7845
    }
7846
  %}
7847
  ins_pipe( pipe_slow );
7848
%}


instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
7852
  predicate(n->bottom_type()->isa_vectmask() &&
7853
            is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7854
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7855
  format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
7856
  ins_encode %{
7857
    assert(UseAVX > 2, "required");
7858
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7859

7860
    int vlen_enc = vector_length_encoding(this, $src1);
7861
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7862
    bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
7863
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7864

7865
    // Compare according to the source element type
7866
    switch (src1_elem_bt) {
7867
      case T_BYTE: {
7868
        __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7869
        break;
7870
      }
7871
      case T_SHORT: {
7872
        __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7873
        break;
7874
      }
7875
      case T_INT: {
7876
        __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7877
        break;
7878
      }
7879
      case T_LONG: {
7880
        __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7881
        break;
7882
      }
7883
      default: assert(false, "%s", type2name(src1_elem_bt));
7884
    }
7885
  %}
7886
  ins_pipe( pipe_slow );
7887
%}

// Extract

instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7892
  predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
7893
  match(Set dst (ExtractI src idx));
7894
  match(Set dst (ExtractS src idx));
7895
#ifdef _LP64
7896
  match(Set dst (ExtractB src idx));
7897
#endif
7898
  format %{ "extractI $dst,$src,$idx\t!" %}
7899
  ins_encode %{
7900
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7901

7902
    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7903
    __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7904
  %}
7905
  ins_pipe( pipe_slow );
7906
%}

instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7909
  predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
7910
            Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
7911
  match(Set dst (ExtractI src idx));
7912
  match(Set dst (ExtractS src idx));
7913
#ifdef _LP64
7914
  match(Set dst (ExtractB src idx));
7915
#endif
7916
  effect(TEMP vtmp);
7917
  format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7918
  ins_encode %{
7919
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7920

7921
    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7922
    XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7923
    __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7924
  %}
7925
  ins_pipe( pipe_slow );
7926
%}

#ifdef _LP64
7929
instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7930
  predicate(Matcher::vector_length(n->in(1)) <= 2); // src
7931
  match(Set dst (ExtractL src idx));
7932
  format %{ "extractL $dst,$src,$idx\t!" %}
7933
  ins_encode %{
7934
    assert(UseSSE >= 4, "required");
7935
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7936

7937
    __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7938
  %}
7939
  ins_pipe( pipe_slow );
7940
%}

instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7943
  predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7944
            Matcher::vector_length(n->in(1)) == 8);  // src
7945
  match(Set dst (ExtractL src idx));
7946
  effect(TEMP vtmp);
7947
  format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7948
  ins_encode %{
7949
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7950

7951
    XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7952
    __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7953
  %}
7954
  ins_pipe( pipe_slow );
7955
%}
7956
#endif

instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
7959
  predicate(Matcher::vector_length(n->in(1)) <= 4);
7960
  match(Set dst (ExtractF src idx));
7961
  effect(TEMP dst, TEMP vtmp);
7962
  format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
7963
  ins_encode %{
7964
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7965

7966
    __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
7967
  %}
7968
  ins_pipe( pipe_slow );
7969
%}

instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
7972
  predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
7973
            Matcher::vector_length(n->in(1)/*src*/) == 16);
7974
  match(Set dst (ExtractF src idx));
7975
  effect(TEMP vtmp);
7976
  format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
7977
  ins_encode %{
7978
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7979

7980
    XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7981
    __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
7982
  %}
7983
  ins_pipe( pipe_slow );
7984
%}

instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7987
  predicate(Matcher::vector_length(n->in(1)) == 2); // src
7988
  match(Set dst (ExtractD src idx));
7989
  format %{ "extractD $dst,$src,$idx\t!" %}
7990
  ins_encode %{
7991
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7992

7993
    __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7994
  %}
7995
  ins_pipe( pipe_slow );
7996
%}

instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7999
  predicate(Matcher::vector_length(n->in(1)) == 4 || // src
8000
            Matcher::vector_length(n->in(1)) == 8);  // src
8001
  match(Set dst (ExtractD src idx));
8002
  effect(TEMP vtmp);
8003
  format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
8004
  ins_encode %{
8005
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
8006

8007
    XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
8008
    __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
8009
  %}
8010
  ins_pipe( pipe_slow );
8011
%}

// --------------------------------- Vector Blend --------------------------------------

instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
8016
  predicate(UseAVX == 0);
8017
  match(Set dst (VectorBlend (Binary dst src) mask));
8018
  format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
8019
  effect(TEMP tmp);
8020
  ins_encode %{
8021
    assert(UseSSE >= 4, "required");
8022

8023
    if ($mask$$XMMRegister != $tmp$$XMMRegister) {
8024
      __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
8025
    }
8026
    __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
8027
  %}
8028
  ins_pipe( pipe_slow );
8029
%}

instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
8032
  predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
8033
            n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
8034
            Matcher::vector_length_in_bytes(n) <= 32 &&
8035
            is_integral_type(Matcher::vector_element_basic_type(n)));
8036
  match(Set dst (VectorBlend (Binary src1 src2) mask));
8037
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
8038
  ins_encode %{
8039
    int vlen_enc = vector_length_encoding(this);
8040
    __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
8041
  %}
8042
  ins_pipe( pipe_slow );
8043
%}

instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
8046
  predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
8047
            n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
8048
            Matcher::vector_length_in_bytes(n) <= 32 &&
8049
            !is_integral_type(Matcher::vector_element_basic_type(n)));
8050
  match(Set dst (VectorBlend (Binary src1 src2) mask));
8051
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
8052
  ins_encode %{
8053
    int vlen_enc = vector_length_encoding(this);
8054
    __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
8055
  %}
8056
  ins_pipe( pipe_slow );
8057
%}

instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
8060
  predicate(UseAVX > 0 && EnableX86ECoreOpts &&
8061
            n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
8062
            Matcher::vector_length_in_bytes(n) <= 32);
8063
  match(Set dst (VectorBlend (Binary src1 src2) mask));
8064
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
8065
  effect(TEMP vtmp, TEMP dst);
8066
  ins_encode %{
8067
    int vlen_enc = vector_length_encoding(this);
8068
    __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
8069
    __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8070
    __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
8071
  %}
8072
  ins_pipe( pipe_slow );
8073
%}

instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
8076
  predicate(Matcher::vector_length_in_bytes(n) == 64 &&
8077
            n->in(2)->bottom_type()->isa_vectmask() == nullptr);
8078
  match(Set dst (VectorBlend (Binary src1 src2) mask));
8079
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
8080
  effect(TEMP ktmp);
8081
  ins_encode %{
8082
     int vlen_enc = Assembler::AVX_512bit;
8083
     BasicType elem_bt = Matcher::vector_element_basic_type(this);
8084
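    // Materialize the vector mask into a k-register by comparing it against all-ones, then blend under it.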
    __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
8085
    __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8086
  %}
8087
  ins_pipe( pipe_slow );
8088
%}


instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
8092
  predicate(n->in(2)->bottom_type()->isa_vectmask() &&
8093
            (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
8094
             VM_Version::supports_avx512bw()));
8095
  match(Set dst (VectorBlend (Binary src1 src2) mask));
8096
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
8097
  ins_encode %{
8098
    int vlen_enc = vector_length_encoding(this);
8099
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
8100
    __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8101
  %}
8102
  ins_pipe( pipe_slow );
8103
%}

// --------------------------------- ABS --------------------------------------
// a = |a|
instruct vabsB_reg(vec dst, vec src) %{
8108
  match(Set dst (AbsVB  src));
8109
  format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
8110
  ins_encode %{
8111
    uint vlen = Matcher::vector_length(this);
8112
    if (vlen <= 16) {
8113
      __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
8114
    } else {
8115
      int vlen_enc = vector_length_encoding(this);
8116
      __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8117
    }
8118
  %}
8119
  ins_pipe( pipe_slow );
8120
%}

instruct vabsS_reg(vec dst, vec src) %{
8123
  match(Set dst (AbsVS  src));
8124
  format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
8125
  ins_encode %{
8126
    uint vlen = Matcher::vector_length(this);
8127
    if (vlen <= 8) {
8128
      __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
8129
    } else {
8130
      int vlen_enc = vector_length_encoding(this);
8131
      __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8132
    }
8133
  %}
8134
  ins_pipe( pipe_slow );
8135
%}

instruct vabsI_reg(vec dst, vec src) %{
8138
  match(Set dst (AbsVI  src));
8139
  format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
8140
  ins_encode %{
8141
    uint vlen = Matcher::vector_length(this);
8142
    if (vlen <= 4) {
8143
      __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
8144
    } else {
8145
      int vlen_enc = vector_length_encoding(this);
8146
      __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8147
    }
8148
  %}
8149
  ins_pipe( pipe_slow );
8150
%}

instruct vabsL_reg(vec dst, vec src) %{
8153
  match(Set dst (AbsVL  src));
8154
  format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
8155
  ins_encode %{
8156
    assert(UseAVX > 2, "required");
8157
    int vlen_enc = vector_length_encoding(this);
8158
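    // Without AVX512VL the packed quadword abs is only available at 512-bit vector length.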
    if (!VM_Version::supports_avx512vl()) {
8159
      vlen_enc = Assembler::AVX_512bit;
8160
    }
8161
    __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8162
  %}
8163
  ins_pipe( pipe_slow );
8164
%}

// --------------------------------- ABSNEG --------------------------------------

instruct vabsnegF(vec dst, vec src) %{
8169
  predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
8170
  match(Set dst (AbsVF src));
8171
  match(Set dst (NegVF src));
8172
  format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
8173
  ins_cost(150);
8174
  ins_encode %{
8175
    int opcode = this->ideal_Opcode();
8176
    int vlen = Matcher::vector_length(this);
8177
    if (vlen == 2) {
8178
      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
8179
    } else {
8180
      assert(vlen == 8 || vlen == 16, "required");
8181
      int vlen_enc = vector_length_encoding(this);
8182
      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8183
    }
8184
  %}
8185
  ins_pipe( pipe_slow );
8186
%}

instruct vabsneg4F(vec dst) %{
8189
  predicate(Matcher::vector_length(n) == 4);
8190
  match(Set dst (AbsVF dst));
8191
  match(Set dst (NegVF dst));
8192
  format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
8193
  ins_cost(150);
8194
  ins_encode %{
8195
    int opcode = this->ideal_Opcode();
8196
    __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
8197
  %}
8198
  ins_pipe( pipe_slow );
8199
%}

instruct vabsnegD(vec dst, vec src) %{
8202
  match(Set dst (AbsVD  src));
8203
  match(Set dst (NegVD  src));
8204
  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
8205
  ins_encode %{
8206
    int opcode = this->ideal_Opcode();
8207
    uint vlen = Matcher::vector_length(this);
8208
    if (vlen == 2) {
8209
      assert(UseSSE >= 2, "required");
8210
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
8211
    } else {
8212
      int vlen_enc = vector_length_encoding(this);
8213
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8214
    }
8215
  %}
8216
  ins_pipe( pipe_slow );
8217
%}

//------------------------------------- VectorTest --------------------------------------------

#ifdef _LP64
8222
instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
8223
  predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
8224
  match(Set cr (VectorTest src1 src2));
8225
  effect(TEMP vtmp);
8226
  format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
8227
  ins_encode %{
8228
    BasicType bt = Matcher::vector_element_basic_type(this, $src1);
8229
    int vlen = Matcher::vector_length_in_bytes(this, $src1);
8230
    __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
8231
  %}
8232
  ins_pipe( pipe_slow );
8233
%}

instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
8236
  predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
8237
  match(Set cr (VectorTest src1 src2));
8238
  format %{ "vptest_ge16  $src1, $src2\n\t" %}
8239
  ins_encode %{
8240
    BasicType bt = Matcher::vector_element_basic_type(this, $src1);
8241
    int vlen = Matcher::vector_length_in_bytes(this, $src1);
8242
    __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
8243
  %}
8244
  ins_pipe( pipe_slow );
8245
%}

instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
8248
  predicate((Matcher::vector_length(n->in(1)) < 8 ||
8249
             (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
8250
            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
8251
  match(Set cr (VectorTest src1 src2));
8252
  effect(TEMP tmp);
8253
  format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
8254
  ins_encode %{
8255
    uint masklen = Matcher::vector_length(this, $src1);
8256
    __ kmovwl($tmp$$Register, $src1$$KRegister);
8257
    __ andl($tmp$$Register, (1 << masklen) - 1);
8258
    __ cmpl($tmp$$Register, (1 << masklen) - 1);
8259
  %}
8260
  ins_pipe( pipe_slow );
8261
%}

instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
8264
  predicate((Matcher::vector_length(n->in(1)) < 8 ||
8265
             (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
8266
            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
8267
  match(Set cr (VectorTest src1 src2));
8268
  effect(TEMP tmp);
8269
  format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
8270
  ins_encode %{
8271
    uint masklen = Matcher::vector_length(this, $src1);
8272
    __ kmovwl($tmp$$Register, $src1$$KRegister);
8273
    __ andl($tmp$$Register, (1 << masklen) - 1);
8274
  %}
8275
  ins_pipe( pipe_slow );
8276
%}

instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
8279
  predicate(Matcher::vector_length(n->in(1)) >= 16 ||
8280
            (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
8281
  match(Set cr (VectorTest src1 src2));
8282
  format %{ "ktest_ge8  $src1, $src2\n\t" %}
8283
  ins_encode %{
8284
    uint masklen = Matcher::vector_length(this, $src1);
8285
    __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
8286
  %}
8287
  ins_pipe( pipe_slow );
8288
%}
8289
#endif

//------------------------------------- LoadMask --------------------------------------------

instruct loadMask(legVec dst, legVec src) %{
8294
  predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
8295
  match(Set dst (VectorLoadMask src));
8296
  effect(TEMP dst);
8297
  format %{ "vector_loadmask_byte $dst, $src\n\t" %}
8298
  ins_encode %{
8299
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8300
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
8301
    __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
8302
  %}
8303
  ins_pipe( pipe_slow );
8304
%}

instruct loadMask64(kReg dst, vec src, vec xtmp) %{
8307
  predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
8308
  match(Set dst (VectorLoadMask src));
8309
  effect(TEMP xtmp);
8310
  format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
8311
  ins_encode %{
8312
    __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
8313
                        true, Assembler::AVX_512bit);
8314
  %}
8315
  ins_pipe( pipe_slow );
8316
%}

instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
8319
  predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
8320
  match(Set dst (VectorLoadMask src));
8321
  effect(TEMP xtmp);
8322
  format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
8323
  ins_encode %{
8324
    int vlen_enc = vector_length_encoding(in(1));
8325
    __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
8326
                        false, vlen_enc);
8327
  %}
8328
  ins_pipe( pipe_slow );
8329
%}

//------------------------------------- StoreMask --------------------------------------------

instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
8334
  predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
8335
  match(Set dst (VectorStoreMask src size));
8336
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8337
  ins_encode %{
8338
    int vlen = Matcher::vector_length(this);
8339
    if (vlen <= 16 && UseAVX <= 2) {
8340
      assert(UseSSE >= 3, "required");
8341
      __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
8342
    } else {
8343
      assert(UseAVX > 0, "required");
8344
      int src_vlen_enc = vector_length_encoding(this, $src);
8345
      __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8346
    }
8347
  %}
8348
  ins_pipe( pipe_slow );
8349
%}

instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
8352
  predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
8353
  match(Set dst (VectorStoreMask src size));
8354
  effect(TEMP_DEF dst, TEMP xtmp);
8355
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8356
  ins_encode %{
8357
    int vlen_enc = Assembler::AVX_128bit;
8358
    int vlen = Matcher::vector_length(this);
8359
    if (vlen <= 8) {
8360
      assert(UseSSE >= 3, "required");
8361
      __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8362
      __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
8363
      __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8364
    } else {
8365
      assert(UseAVX > 0, "required");
8366
      __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
8367
      __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8368
      __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8369
    }
8370
  %}
8371
  ins_pipe( pipe_slow );
8372
%}

instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
8375
  predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
8376
  match(Set dst (VectorStoreMask src size));
8377
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8378
  effect(TEMP_DEF dst, TEMP xtmp);
8379
  ins_encode %{
8380
    int vlen_enc = Assembler::AVX_128bit;
8381
    int vlen = Matcher::vector_length(this);
8382
    if (vlen <= 4) {
8383
      assert(UseSSE >= 3, "required");
8384
      __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8385
      __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
8386
      __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
8387
      __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8388
    } else {
8389
      assert(UseAVX > 0, "required");
8390
      __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
8391
      __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
8392
      __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8393
      __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
8394
      __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8395
    }
8396
  %}
8397
  ins_pipe( pipe_slow );
8398
%}

instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
8401
  predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
8402
  match(Set dst (VectorStoreMask src size));
8403
  effect(TEMP_DEF dst, TEMP xtmp);
8404
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8405
  ins_encode %{
8406
    assert(UseSSE >= 3, "required");
8407
    __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8408
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
8409
    __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
8410
    __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
8411
    __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8412
  %}
8413
  ins_pipe( pipe_slow );
8414
%}

instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
8417
  predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
8418
  match(Set dst (VectorStoreMask src size));
8419
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
8420
  effect(TEMP_DEF dst, TEMP vtmp);
8421
  ins_encode %{
8422
    int vlen_enc = Assembler::AVX_128bit;
8423
    __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
8424
    __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
8425
    __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
8426
    __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8427
    __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8428
    __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8429
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8430
  %}
8431
  ins_pipe( pipe_slow );
8432
%}

instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
8435
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
8436
  match(Set dst (VectorStoreMask src size));
8437
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8438
  ins_encode %{
8439
    int src_vlen_enc = vector_length_encoding(this, $src);
8440
    int dst_vlen_enc = vector_length_encoding(this);
8441
    if (!VM_Version::supports_avx512vl()) {
8442
      src_vlen_enc = Assembler::AVX_512bit;
8443
    }
8444
    __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8445
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8446
  %}
8447
  ins_pipe( pipe_slow );
8448
%}

instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
8451
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
8452
  match(Set dst (VectorStoreMask src size));
8453
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8454
  ins_encode %{
8455
    int src_vlen_enc = vector_length_encoding(this, $src);
8456
    int dst_vlen_enc = vector_length_encoding(this);
8457
    if (!VM_Version::supports_avx512vl()) {
8458
      src_vlen_enc = Assembler::AVX_512bit;
8459
    }
8460
    __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8461
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8462
  %}
8463
  ins_pipe( pipe_slow );
8464
%}

instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
8467
  predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
8468
  match(Set dst (VectorStoreMask mask size));
8469
  effect(TEMP_DEF dst);
8470
  format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8471
  ins_encode %{
8472
    assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
8473
    __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
8474
                 false, Assembler::AVX_512bit, noreg);
8475
    __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
8476
  %}
8477
  ins_pipe( pipe_slow );
8478
%}

instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
8481
  predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
8482
  match(Set dst (VectorStoreMask mask size));
8483
  effect(TEMP_DEF dst);
8484
  format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8485
  ins_encode %{
8486
    int dst_vlen_enc = vector_length_encoding(this);
8487
    __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
8488
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8489
  %}
8490
  ins_pipe( pipe_slow );
8491
%}

instruct vmaskcast_evex(kReg dst) %{
8494
  match(Set dst (VectorMaskCast dst));
8495
  ins_cost(0);
8496
  format %{ "vector_mask_cast $dst" %}
8497
  ins_encode %{
8498
    // empty
8499
  %}
8500
  ins_pipe(empty);
8501
%}

instruct vmaskcast(vec dst) %{
8504
  predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
8505
  match(Set dst (VectorMaskCast dst));
8506
  ins_cost(0);
8507
  format %{ "vector_mask_cast $dst" %}
8508
  ins_encode %{
8509
    // empty
8510
  %}
8511
  ins_pipe(empty);
8512
%}

instruct vmaskcast_avx(vec dst, vec src) %{
8515
  predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
8516
  match(Set dst (VectorMaskCast src));
8517
  format %{ "vector_mask_cast $dst, $src" %}
8518
  ins_encode %{
8519
    int vlen = Matcher::vector_length(this);
8520
    BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
8521
    BasicType dst_bt = Matcher::vector_element_basic_type(this);
8522
    __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
8523
  %}
8524
  ins_pipe(pipe_slow);
8525
%}

//-------------------------------- Load Iota Indices ----------------------------------

instruct loadIotaIndices(vec dst, immI_0 src) %{
8530
  match(Set dst (VectorLoadConst src));
8531
  format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
8532
  ins_encode %{
8533
     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8534
     BasicType bt = Matcher::vector_element_basic_type(this);
8535
     __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
8536
  %}
8537
  ins_pipe( pipe_slow );
8538
%}

#ifdef _LP64
8541
instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
8542
  match(Set dst (PopulateIndex src1 src2));
8543
  effect(TEMP dst, TEMP vtmp);
8544
  format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
8545
  ins_encode %{
8546
     assert($src2$$constant == 1, "required");
8547
     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8548
     int vlen_enc = vector_length_encoding(this);
8549
     BasicType elem_bt = Matcher::vector_element_basic_type(this);
8550
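     // dst[i] = src1 + i; the stride ($src2) is asserted to be 1 above.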
     __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
8551
     __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
8552
     __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8553
  %}
8554
  ins_pipe( pipe_slow );
8555
%}

instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
8558
  match(Set dst (PopulateIndex src1 src2));
8559
  effect(TEMP dst, TEMP vtmp);
8560
  format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
8561
  ins_encode %{
8562
     assert($src2$$constant == 1, "required");
8563
     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8564
     int vlen_enc = vector_length_encoding(this);
8565
     BasicType elem_bt = Matcher::vector_element_basic_type(this);
8566
     __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
8567
     __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
8568
     __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8569
  %}
8570
  ins_pipe( pipe_slow );
8571
%}
8572
#endif
8573
//-------------------------------- Rearrange ----------------------------------

// LoadShuffle/Rearrange for Byte

instruct loadShuffleB(vec dst) %{
8578
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8579
  match(Set dst (VectorLoadShuffle dst));
8580
  format %{ "vector_load_shuffle $dst, $dst" %}
8581
  ins_encode %{
8582
    // empty
8583
  %}
8584
  ins_pipe( pipe_slow );
8585
%}

instruct rearrangeB(vec dst, vec shuffle) %{
8588
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8589
            Matcher::vector_length(n) < 32);
8590
  match(Set dst (VectorRearrange dst shuffle));
8591
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8592
  ins_encode %{
8593
    assert(UseSSE >= 4, "required");
8594
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8595
  %}
8596
  ins_pipe( pipe_slow );
8597
%}

instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
8600
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8601
            Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
8602
  match(Set dst (VectorRearrange src shuffle));
8603
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
8604
  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
8605
  ins_encode %{
8606
    assert(UseAVX >= 2, "required");
8607
    // Swap src into vtmp1
8608
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8609
    // Shuffle swapped src to get entries from other 128 bit lane
8610
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8611
    // Shuffle original src to get entries from its own 128 bit lane
8612
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8613
    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8614
    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
8615
    // Perform the blend
8616
    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8617
  %}
8618
  ins_pipe( pipe_slow );
8619
%}


instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
8623
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8624
            Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
8625
  match(Set dst (VectorRearrange src shuffle));
8626
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
8627
  format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
8628
  ins_encode %{
8629
    int vlen_enc = vector_length_encoding(this);
8630
    __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
8631
                       $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
8632
                       $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
8633
  %}
8634
  ins_pipe( pipe_slow );
8635
%}

instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
8638
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8639
            Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
8640
  match(Set dst (VectorRearrange src shuffle));
8641
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
8642
  ins_encode %{
8643
    int vlen_enc = vector_length_encoding(this);
8644
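    // AVX512_VBMI provides vpermb, a direct cross-lane byte permute.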
    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8645
  %}
8646
  ins_pipe( pipe_slow );
8647
%}

// LoadShuffle/Rearrange for Short

instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
8652
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8653
            Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
8654
  match(Set dst (VectorLoadShuffle src));
8655
  effect(TEMP dst, TEMP vtmp);
8656
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
8657
  ins_encode %{
8658
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
8660
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8661
    if (UseAVX == 0) {
8662
      assert(vlen_in_bytes <= 16, "required");
8663
      // Multiply each shuffle by two to get byte index
8664
      __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
8665
      __ psllw($vtmp$$XMMRegister, 1);
8666

8667
      // Duplicate to create 2 copies of byte index
8668
      __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8669
      __ psllw($dst$$XMMRegister, 8);
8670
      __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
8671

8672
      // Add one to get alternate byte index
8673
      __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
8674
      __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8675
    } else {
8676
      assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
8677
      int vlen_enc = vector_length_encoding(this);
8678
      // Multiply each shuffle by two to get byte index
8679
      __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8680
      __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8681

8682
      // Duplicate to create 2 copies of byte index
8683
      __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
8684
      __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8685

8686
      // Add one to get alternate byte index
8687
      __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
8688
    }
8689
  %}
8690
  ins_pipe( pipe_slow );
8691
%}

instruct rearrangeS(vec dst, vec shuffle) %{
8694
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8695
            Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
8696
  match(Set dst (VectorRearrange dst shuffle));
8697
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8698
  ins_encode %{
8699
    assert(UseSSE >= 4, "required");
8700
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8701
  %}
8702
  ins_pipe( pipe_slow );
8703
%}

instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
8706
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8707
            Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
8708
  match(Set dst (VectorRearrange src shuffle));
8709
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
8710
  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
8711
  ins_encode %{
8712
    assert(UseAVX >= 2, "required");
8713
    // Swap src into vtmp1
8714
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8715
    // Shuffle swapped src to get entries from other 128 bit lane
8716
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8717
    // Shuffle original src to get entries from its own 128 bit lane
8718
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8719
    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8720
    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
8721
    // Perform the blend
8722
    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8723
  %}
8724
  ins_pipe( pipe_slow );
8725
%}

instruct loadShuffleS_evex(vec dst, vec src) %{
8728
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8729
            VM_Version::supports_avx512bw());
8730
  match(Set dst (VectorLoadShuffle src));
8731
  format %{ "vector_load_shuffle $dst, $src" %}
8732
  ins_encode %{
8733
    int vlen_enc = vector_length_encoding(this);
8734
    if (!VM_Version::supports_avx512vl()) {
8735
      vlen_enc = Assembler::AVX_512bit;
8736
    }
8737
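    // Zero-extend the byte shuffle indices to words so they can feed vpermw.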
    __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8738
  %}
8739
  ins_pipe( pipe_slow );
8740
%}

instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
8743
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8744
            VM_Version::supports_avx512bw());
8745
  match(Set dst (VectorRearrange src shuffle));
8746
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
8747
  ins_encode %{
8748
    int vlen_enc = vector_length_encoding(this);
8749
    if (!VM_Version::supports_avx512vl()) {
8750
      vlen_enc = Assembler::AVX_512bit;
8751
    }
8752
    __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8753
  %}
8754
  ins_pipe( pipe_slow );
8755
%}

// LoadShuffle/Rearrange for Integer and Float

instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
8760
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8761
            Matcher::vector_length(n) == 4 && UseAVX == 0);
8762
  match(Set dst (VectorLoadShuffle src));
8763
  effect(TEMP dst, TEMP vtmp);
8764
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
8765
  ins_encode %{
8766
    assert(UseSSE >= 4, "required");
8767

8768
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
8770

8771
    // Duplicate and multiply each shuffle by 4
8772
    __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
8773
    __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8774
    __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8775
    __ psllw($vtmp$$XMMRegister, 2);
8776

8777
    // Duplicate again to create 4 copies of byte index
8778
    __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8779
    __ psllw($dst$$XMMRegister, 8);
8780
    __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
8781

8782
    // Add 3,2,1,0 to get alternate byte index
8783
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
8784
    __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8785
  %}
8786
  ins_pipe( pipe_slow );
8787
%}

instruct rearrangeI(vec dst, vec shuffle) %{
8790
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8791
            UseAVX == 0);
8792
  match(Set dst (VectorRearrange dst shuffle));
8793
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8794
  ins_encode %{
8795
    assert(UseSSE >= 4, "required");
8796
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8797
  %}
8798
  ins_pipe( pipe_slow );
8799
%}

instruct loadShuffleI_avx(vec dst, vec src) %{
8802
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8803
            UseAVX > 0);
8804
  match(Set dst (VectorLoadShuffle src));
8805
  format %{ "vector_load_shuffle $dst, $src" %}
8806
  ins_encode %{
8807
    int vlen_enc = vector_length_encoding(this);
8808
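    // Zero-extend the byte shuffle indices to dwords for the int/float rearrange.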
    __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8809
  %}
8810
  ins_pipe( pipe_slow );
8811
%}

instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
8814
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8815
            UseAVX > 0);
8816
  match(Set dst (VectorRearrange src shuffle));
8817
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
8818
  ins_encode %{
8819
    int vlen_enc = vector_length_encoding(this);
8820
    BasicType bt = Matcher::vector_element_basic_type(this);
8821
    __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8822
  %}
8823
  ins_pipe( pipe_slow );
8824
%}

// LoadShuffle/Rearrange for Long and Double

instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
8829
  predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8830
            Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8831
  match(Set dst (VectorLoadShuffle src));
8832
  effect(TEMP dst, TEMP vtmp);
8833
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
8834
  ins_encode %{
8835
    assert(UseAVX >= 2, "required");
8836

8837
    int vlen_enc = vector_length_encoding(this);
8838
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.
8840

8841
    // Multiply each shuffle by two to get double word index
8842
    __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8843
    __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8844

8845
    // Duplicate each double word shuffle
8846
    __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
8847
    __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8848

8849
    // Add one to get alternate double word index
8850
    __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
8851
  %}
8852
  ins_pipe( pipe_slow );
8853
%}
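
// Illustrative scalar sketch (not generated code) of the expansion above: each
// long/double-level index i is turned into the pair of double word indices
// consumed by the vpermd in rearrangeL below.
//
//   for (int k = 0; k < num_elems; k++) {
//     int i = shuffle[k];                // element-level index from $src
//     dword_mask[2 * k]     = 2 * i;     // even dword of element i
//     dword_mask[2 * k + 1] = 2 * i + 1; // odd dword of element i
//   }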
8854

8855
instruct rearrangeL(vec dst, vec src, vec shuffle) %{
  predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int vlen_enc = vector_length_encoding(this);
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
8868

8869
instruct loadShuffleL_evex(vec dst, vec src) %{
  predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
  match(Set dst (VectorLoadShuffle src));
  format %{ "vector_load_shuffle $dst, $src" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int vlen_enc = vector_length_encoding(this);
    __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
  predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int vlen_enc = vector_length_encoding(this);
    if (vlen_enc == Assembler::AVX_128bit) {
      vlen_enc = Assembler::AVX_256bit;
    }
    __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
8899

8900
// --------------------------------- FMA --------------------------------------
8901
// a * b + c
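//
// The packed FMA nodes matched below compute c = a * b + c per lane with a
// single rounding step. An illustrative scalar C++ equivalent (sketch only):
//
//   for (int i = 0; i < n; i++) {
//     c[i] = std::fma(a[i], b[i], c[i]);  // one rounding, unlike a[i]*b[i]+c[i]
//   }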
8902

8903
instruct vfmaF_reg(vec a, vec b, vec c) %{
8904
  match(Set c (FmaVF  c (Binary a b)));
8905
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8906
  ins_cost(150);
8907
  ins_encode %{
8908
    assert(UseFMA, "not enabled");
8909
    int vlen_enc = vector_length_encoding(this);
8910
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8911
  %}
8912
  ins_pipe( pipe_slow );
8913
%}
8914

8915
instruct vfmaF_mem(vec a, memory b, vec c) %{
8916
  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8917
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
8918
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8919
  ins_cost(150);
8920
  ins_encode %{
8921
    assert(UseFMA, "not enabled");
8922
    int vlen_enc = vector_length_encoding(this);
8923
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8924
  %}
8925
  ins_pipe( pipe_slow );
8926
%}
8927

8928
instruct vfmaD_reg(vec a, vec b, vec c) %{
8929
  match(Set c (FmaVD  c (Binary a b)));
8930
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8931
  ins_cost(150);
8932
  ins_encode %{
8933
    assert(UseFMA, "not enabled");
8934
    int vlen_enc = vector_length_encoding(this);
8935
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8936
  %}
8937
  ins_pipe( pipe_slow );
8938
%}
8939

8940
instruct vfmaD_mem(vec a, memory b, vec c) %{
8941
  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8942
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
8943
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8944
  ins_cost(150);
8945
  ins_encode %{
8946
    assert(UseFMA, "not enabled");
8947
    int vlen_enc = vector_length_encoding(this);
8948
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8949
  %}
8950
  ins_pipe( pipe_slow );
8951
%}
8952

8953
// --------------------------------- Vector Multiply Add --------------------------------------

instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
  predicate(UseAVX == 0);
  match(Set dst (MulAddVS2VI dst src1));
  format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
  ins_encode %{
    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulAddVS2VI src1 src2));
  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
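
// Illustrative scalar sketch (not generated code) of what pmaddwd/vpmaddwd
// compute for MulAddVS2VI: adjacent signed 16-bit products are summed into
// each 32-bit lane.
//
//   for (int i = 0; i < num_ints; i++) {
//     dst[i] = (int)src1_s16[2 * i]     * (int)src2_s16[2 * i]
//            + (int)src1_s16[2 * i + 1] * (int)src2_s16[2 * i + 1];
//   }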
8975

8976
// --------------------------------- Vector Multiply Add Add ----------------------------------

instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
  predicate(VM_Version::supports_avx512_vnni());
  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int vlen_enc = vector_length_encoding(this);
    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
  ins_cost(10);
%}
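
// Illustrative scalar sketch (not generated code): the AVX512-VNNI form fuses
// the pmaddwd with the following add, accumulating into the destination.
//
//   for (int i = 0; i < num_ints; i++) {
//     dst[i] += (int)src1_s16[2 * i]     * (int)src2_s16[2 * i]
//             + (int)src1_s16[2 * i + 1] * (int)src2_s16[2 * i + 1];
//   }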
8990

8991
// --------------------------------- PopCount --------------------------------------
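//
// PopCountVI/PopCountVL count the set bits in every lane. Illustrative scalar
// sketch (not generated code), shown for the int case:
//
//   for (int i = 0; i < n; i++) {
//     dst[i] = __builtin_popcount(src[i]);   // Integer.bitCount(src[i])
//   }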
8992

8993
instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
8994
  predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
8995
  match(Set dst (PopCountVI src));
8996
  match(Set dst (PopCountVL src));
8997
  format %{ "vector_popcount_integral $dst, $src" %}
8998
  ins_encode %{
8999
    int opcode = this->ideal_Opcode();
9000
    int vlen_enc = vector_length_encoding(this, $src);
9001
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9002
    __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
9003
  %}
9004
  ins_pipe( pipe_slow );
9005
%}
9006

9007
instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
9008
  predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
9009
  match(Set dst (PopCountVI src mask));
9010
  match(Set dst (PopCountVL src mask));
9011
  format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
9012
  ins_encode %{
9013
    int vlen_enc = vector_length_encoding(this, $src);
9014
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9015
    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
9016
    __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
9017
  %}
9018
  ins_pipe( pipe_slow );
9019
%}
9020

9021
instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
9022
  predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
9023
  match(Set dst (PopCountVI src));
9024
  match(Set dst (PopCountVL src));
9025
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
9026
  format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
9027
  ins_encode %{
9028
    int opcode = this->ideal_Opcode();
9029
    int vlen_enc = vector_length_encoding(this, $src);
9030
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9031
    __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
9032
                                $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
9033
  %}
9034
  ins_pipe( pipe_slow );
9035
%}
9036

9037
// --------------------------------- Vector Trailing Zeros Count --------------------------------------

instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
  predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
                                              Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (CountTrailingZerosV src));
  effect(TEMP dst, TEMP xtmp, TEMP rtmp);
  ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                        xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
9054

9055
instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
9056
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
9057
            VM_Version::supports_avx512cd() &&
9058
            (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
9059
  match(Set dst (CountTrailingZerosV src));
9060
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
9061
  ins_cost(400);
9062
  format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
9063
  ins_encode %{
9064
    int vlen_enc = vector_length_encoding(this, $src);
9065
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9066
    __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
9067
                                        $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
9068
  %}
9069
  ins_pipe( pipe_slow );
9070
%}
9071

9072
instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
9073
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
9074
  match(Set dst (CountTrailingZerosV src));
9075
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
9076
  ins_cost(400);
9077
  format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
9078
  ins_encode %{
9079
    int vlen_enc = vector_length_encoding(this, $src);
9080
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9081
    __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
9082
                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
9083
                                        $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
9084
  %}
9085
  ins_pipe( pipe_slow );
9086
%}
9087

9088
instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
9089
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
9090
  match(Set dst (CountTrailingZerosV src));
9091
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
9092
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
9093
  ins_encode %{
9094
    int vlen_enc = vector_length_encoding(this, $src);
9095
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9096
    __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
9097
                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
9098
  %}
9099
  ins_pipe( pipe_slow );
9100
%}
9101

9102

9103
// --------------------------------- Bitwise Ternary Logic ----------------------------------

instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
  match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
  effect(TEMP dst);
  format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
    __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
  match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
  effect(TEMP dst);
  format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
    __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
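
// The 8-bit $func operand is a truth table: for every bit position, the result
// bit is func[(a << 2) | (b << 1) | c], where a, b and c are the corresponding
// bits of $dst, $src2 and $src3. Illustrative examples (sketch only):
//   func = 0x96  ->  a ^ b ^ c                       (three-way XOR)
//   func = 0xE8  ->  (a & b) | (a & c) | (b & c)     (bitwise majority)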
9127

9128
// --------------------------------- Rotation Operations ----------------------------------
9129
instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
9130
  match(Set dst (RotateLeftV src shift));
9131
  match(Set dst (RotateRightV src shift));
9132
  format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
9133
  ins_encode %{
9134
    int opcode      = this->ideal_Opcode();
9135
    int vector_len  = vector_length_encoding(this);
9136
    BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
9137
    __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
9138
  %}
9139
  ins_pipe( pipe_slow );
9140
%}
9141

9142
instruct vprorate(vec dst, vec src, vec shift) %{
9143
  match(Set dst (RotateLeftV src shift));
9144
  match(Set dst (RotateRightV src shift));
9145
  format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
9146
  ins_encode %{
9147
    int opcode      = this->ideal_Opcode();
9148
    int vector_len  = vector_length_encoding(this);
9149
    BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
9150
    __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9151
  %}
9152
  ins_pipe( pipe_slow );
9153
%}
9154

9155
// ---------------------------------- Masked Operations ------------------------------------
9156
instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
9157
  predicate(!n->in(3)->bottom_type()->isa_vectmask());
9158
  match(Set dst (LoadVectorMasked mem mask));
9159
  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
9160
  ins_encode %{
9161
    BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
9162
    int vlen_enc = vector_length_encoding(this);
9163
    __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
9164
  %}
9165
  ins_pipe( pipe_slow );
9166
%}
9167

9168

9169
instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
9170
  predicate(n->in(3)->bottom_type()->isa_vectmask());
9171
  match(Set dst (LoadVectorMasked mem mask));
9172
  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
9173
  ins_encode %{
9174
    BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
9175
    int vector_len = vector_length_encoding(this);
9176
    __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
9177
  %}
9178
  ins_pipe( pipe_slow );
9179
%}
9180

9181
instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
9182
  predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
9183
  match(Set mem (StoreVectorMasked mem (Binary src mask)));
9184
  format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
9185
  ins_encode %{
9186
    const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
9187
    int vlen_enc = vector_length_encoding(src_node);
9188
    BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
9189
    __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
9190
  %}
9191
  ins_pipe( pipe_slow );
9192
%}
9193

9194
instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
9195
  predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
9196
  match(Set mem (StoreVectorMasked mem (Binary src mask)));
9197
  format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
9198
  ins_encode %{
9199
    const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
9200
    BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
9201
    int vlen_enc = vector_length_encoding(src_node);
9202
    __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
9203
  %}
9204
  ins_pipe( pipe_slow );
9205
%}
9206

9207
#ifdef _LP64
9208
instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
9209
  match(Set addr (VerifyVectorAlignment addr mask));
9210
  effect(KILL cr);
9211
  format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
9212
  ins_encode %{
9213
    Label Lskip;
9214
    // check if masked bits of addr are zero
9215
    __ testq($addr$$Register, $mask$$constant);
9216
    __ jccb(Assembler::equal, Lskip);
9217
    __ stop("verify_vector_alignment found a misaligned vector memory access");
9218
    __ bind(Lskip);
9219
  %}
9220
  ins_pipe(pipe_slow);
9221
%}
9222

9223
instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
  match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
  effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
  format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
  ins_encode %{
    assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
    assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");

    Label DONE;
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);

    __ knotql($ktmp2$$KRegister, $mask$$KRegister);
    __ mov64($dst$$Register, -1L);
    __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
    __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
    __ jccb(Assembler::carrySet, DONE);
    __ kmovql($dst$$Register, $ktmp1$$KRegister);
    __ notq($dst$$Register);
    __ tzcntq($dst$$Register, $dst$$Register);
    __ bind(DONE);
  %}
  ins_pipe( pipe_slow );
%}
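
// Illustrative scalar sketch (not generated code) of the value produced above,
// assuming $mask is a prefix mask such as the one VectorMaskGen builds:
//
//   for (int i = 0; i < num_elems; i++) {
//     if (mask[i] && src1[i] != src2[i]) return i;  // first mismatching lane
//   }
//   return -1;                                      // all active lanes equal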
9247

9248

9249
instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
  match(Set dst (VectorMaskGen len));
  effect(TEMP temp, KILL cr);
  format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
  ins_encode %{
    __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
  match(Set dst (VectorMaskGen len));
  format %{ "vector_mask_gen $len \t! vector mask generator" %}
  effect(TEMP temp);
  ins_encode %{
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
    __ kmovql($dst$$KRegister, $temp$$Register);
  %}
  ins_pipe( pipe_slow );
%}
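
// For example, with $len == 5 the constant above is
// 0xFFFFFFFFFFFFFFFF >> (64 - 5) == 0x1F, so the low five mask bits are set
// and only the first five lanes are selected.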
9269

9270
instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
9271
  predicate(n->in(1)->bottom_type()->isa_vectmask());
9272
  match(Set dst (VectorMaskToLong mask));
9273
  effect(TEMP dst, KILL cr);
9274
  format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
9275
  ins_encode %{
9276
    int opcode = this->ideal_Opcode();
9277
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
9278
    int mask_len = Matcher::vector_length(this, $mask);
9279
    int mask_size = mask_len * type2aelembytes(mbt);
9280
    int vlen_enc = vector_length_encoding(this, $mask);
9281
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
9282
                             $dst$$Register, mask_len, mask_size, vlen_enc);
9283
  %}
9284
  ins_pipe( pipe_slow );
9285
%}
9286

9287
instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
9288
  predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
9289
  match(Set dst (VectorMaskToLong mask));
9290
  format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
9291
  effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
9292
  ins_encode %{
9293
    int opcode = this->ideal_Opcode();
9294
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
9295
    int mask_len = Matcher::vector_length(this, $mask);
9296
    int vlen_enc = vector_length_encoding(this, $mask);
9297
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
9298
                             $dst$$Register, mask_len, mbt, vlen_enc);
9299
  %}
9300
  ins_pipe( pipe_slow );
9301
%}
9302

9303
instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
9304
  predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
9305
  match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
9306
  format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
9307
  effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
9308
  ins_encode %{
9309
    int opcode = this->ideal_Opcode();
9310
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
9311
    int mask_len = Matcher::vector_length(this, $mask);
9312
    int vlen_enc = vector_length_encoding(this, $mask);
9313
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
9314
                             $dst$$Register, mask_len, mbt, vlen_enc);
9315
  %}
9316
  ins_pipe( pipe_slow );
9317
%}
9318

9319
instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
9320
  predicate(n->in(1)->bottom_type()->isa_vectmask());
9321
  match(Set dst (VectorMaskTrueCount mask));
9322
  effect(TEMP_DEF dst, TEMP tmp, KILL cr);
9323
  format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
9324
  ins_encode %{
9325
    int opcode = this->ideal_Opcode();
9326
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
9327
    int mask_len = Matcher::vector_length(this, $mask);
9328
    int mask_size = mask_len * type2aelembytes(mbt);
9329
    int vlen_enc = vector_length_encoding(this, $mask);
9330
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
9331
                             $tmp$$Register, mask_len, mask_size, vlen_enc);
9332
  %}
9333
  ins_pipe( pipe_slow );
9334
%}
9335

9336
instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
9337
  predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
9338
  match(Set dst (VectorMaskTrueCount mask));
9339
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
9340
  format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
9341
  ins_encode %{
9342
    int opcode = this->ideal_Opcode();
9343
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
9344
    int mask_len = Matcher::vector_length(this, $mask);
9345
    int vlen_enc = vector_length_encoding(this, $mask);
9346
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
9347
                             $tmp$$Register, mask_len, mbt, vlen_enc);
9348
  %}
9349
  ins_pipe( pipe_slow );
9350
%}
9351

9352
instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
9353
  predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
9354
  match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
9355
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
9356
  format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
9357
  ins_encode %{
9358
    int opcode = this->ideal_Opcode();
9359
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
9360
    int mask_len = Matcher::vector_length(this, $mask);
9361
    int vlen_enc = vector_length_encoding(this, $mask);
9362
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
9363
                             $tmp$$Register, mask_len, mbt, vlen_enc);
9364
  %}
9365
  ins_pipe( pipe_slow );
9366
%}
9367

9368
instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
9369
  predicate(n->in(1)->bottom_type()->isa_vectmask());
9370
  match(Set dst (VectorMaskFirstTrue mask));
9371
  match(Set dst (VectorMaskLastTrue mask));
9372
  effect(TEMP_DEF dst, TEMP tmp, KILL cr);
9373
  format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
9374
  ins_encode %{
9375
    int opcode = this->ideal_Opcode();
9376
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
9377
    int mask_len = Matcher::vector_length(this, $mask);
9378
    int mask_size = mask_len * type2aelembytes(mbt);
9379
    int vlen_enc = vector_length_encoding(this, $mask);
9380
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
9381
                             $tmp$$Register, mask_len, mask_size, vlen_enc);
9382
  %}
9383
  ins_pipe( pipe_slow );
9384
%}
9385

9386
instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
9387
  predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
9388
  match(Set dst (VectorMaskFirstTrue mask));
9389
  match(Set dst (VectorMaskLastTrue mask));
9390
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
9391
  format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
9392
  ins_encode %{
9393
    int opcode = this->ideal_Opcode();
9394
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
9395
    int mask_len = Matcher::vector_length(this, $mask);
9396
    int vlen_enc = vector_length_encoding(this, $mask);
9397
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
9398
                             $tmp$$Register, mask_len, mbt, vlen_enc);
9399
  %}
9400
  ins_pipe( pipe_slow );
9401
%}
9402

9403
instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
9404
  predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
9405
  match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
9406
  match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
9407
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
9408
  format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
9409
  ins_encode %{
9410
    int opcode = this->ideal_Opcode();
9411
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
9412
    int mask_len = Matcher::vector_length(this, $mask);
9413
    int vlen_enc = vector_length_encoding(this, $mask);
9414
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
9415
                             $tmp$$Register, mask_len, mbt, vlen_enc);
9416
  %}
9417
  ins_pipe( pipe_slow );
9418
%}
9419

9420
// --------------------------------- Compress/Expand Operations ---------------------------
9421
#ifdef _LP64
9422
instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
9423
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
9424
  match(Set dst (CompressV src mask));
9425
  match(Set dst (ExpandV src mask));
9426
  effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
9427
  format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
9428
  ins_encode %{
9429
    int opcode = this->ideal_Opcode();
9430
    int vlen_enc = vector_length_encoding(this);
9431
    BasicType bt  = Matcher::vector_element_basic_type(this);
9432
    __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
9433
                                   $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
9434
  %}
9435
  ins_pipe( pipe_slow );
9436
%}
9437
#endif
9438

9439
instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
9440
  predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
9441
  match(Set dst (CompressV src mask));
9442
  match(Set dst (ExpandV src mask));
9443
  format %{ "vector_compress_expand $dst, $src, $mask" %}
9444
  ins_encode %{
9445
    int opcode = this->ideal_Opcode();
9446
    int vector_len = vector_length_encoding(this);
9447
    BasicType bt  = Matcher::vector_element_basic_type(this);
9448
    __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
9449
  %}
9450
  ins_pipe( pipe_slow );
9451
%}
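
// Illustrative scalar sketch (not generated code) of the two node semantics,
// assuming non-selected destination lanes are zeroed:
//
//   CompressV:  for (int i = 0, j = 0; i < n; i++) if (mask[i]) dst[j++] = src[i];
//   ExpandV:    for (int i = 0, j = 0; i < n; i++) dst[i] = mask[i] ? src[j++] : 0;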
9452

9453
instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
9454
  match(Set dst (CompressM mask));
9455
  effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
9456
  format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
9457
  ins_encode %{
9458
    assert(this->in(1)->bottom_type()->isa_vectmask(), "");
9459
    int mask_len = Matcher::vector_length(this);
9460
    __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
9461
  %}
9462
  ins_pipe( pipe_slow );
9463
%}
9464

9465
#endif // _LP64
9466

9467
// -------------------------------- Bit and Byte Reversal Vector Operations ------------------------

instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
  predicate(!VM_Version::supports_gfni());
  match(Set dst (ReverseV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
  ins_encode %{
    int vec_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                          $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
  predicate(VM_Version::supports_gfni());
  match(Set dst (ReverseV src));
  effect(TEMP dst, TEMP xtmp);
  format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
  ins_encode %{
    int vec_enc = vector_length_encoding(this);
    BasicType bt  = Matcher::vector_element_basic_type(this);
    InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
    __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
                               $xtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
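
// Illustrative scalar sketch (not generated code) of ReverseV: every lane has
// its bit order reversed, shown here for 32-bit lanes (Integer.reverse):
//
//   for (int i = 0; i < n; i++) {
//     uint32_t x = src[i], r = 0;
//     for (int b = 0; b < 32; b++) { r = (r << 1) | (x & 1); x >>= 1; }
//     dst[i] = r;
//   }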
9497

9498
instruct vreverse_byte_reg(vec dst, vec src) %{
9499
  predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
9500
  match(Set dst (ReverseBytesV src));
9501
  effect(TEMP dst);
9502
  format %{ "vector_reverse_byte $dst, $src" %}
9503
  ins_encode %{
9504
    int vec_enc = vector_length_encoding(this);
9505
    BasicType bt = Matcher::vector_element_basic_type(this);
9506
    __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
9507
  %}
9508
  ins_pipe( pipe_slow );
9509
%}
9510

9511
instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
9512
  predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
9513
  match(Set dst (ReverseBytesV src));
9514
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
9515
  format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
9516
  ins_encode %{
9517
    int vec_enc = vector_length_encoding(this);
9518
    BasicType bt = Matcher::vector_element_basic_type(this);
9519
    __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
9520
                             $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
9521
  %}
9522
  ins_pipe( pipe_slow );
9523
%}
9524

9525
// ---------------------------------- Vector Count Leading Zeros -----------------------------------
9526

9527
instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
9528
  predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
9529
                                              Matcher::vector_length_in_bytes(n->in(1))));
9530
  match(Set dst (CountLeadingZerosV src));
9531
  format %{ "vector_count_leading_zeros $dst, $src" %}
9532
  ins_encode %{
9533
     int vlen_enc = vector_length_encoding(this, $src);
9534
     BasicType bt = Matcher::vector_element_basic_type(this, $src);
9535
     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
9536
                                        xnoreg, xnoreg, k0, noreg, true, vlen_enc);
9537
  %}
9538
  ins_pipe( pipe_slow );
9539
%}
9540

9541
instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
9542
  predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
9543
                                              Matcher::vector_length_in_bytes(n->in(1))));
9544
  match(Set dst (CountLeadingZerosV src mask));
9545
  format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
9546
  ins_encode %{
9547
    int vlen_enc = vector_length_encoding(this, $src);
9548
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9549
    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
9550
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
9551
                                       xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
9552
  %}
9553
  ins_pipe( pipe_slow );
9554
%}
9555

9556
instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
9557
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
9558
            VM_Version::supports_avx512cd() &&
9559
            (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
9560
  match(Set dst (CountLeadingZerosV src));
9561
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
9562
  format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
9563
  ins_encode %{
9564
    int vlen_enc = vector_length_encoding(this, $src);
9565
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9566
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
9567
                                       $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
9568
  %}
9569
  ins_pipe( pipe_slow );
9570
%}
9571

9572
instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
9573
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
9574
  match(Set dst (CountLeadingZerosV src));
9575
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
9576
  format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
9577
  ins_encode %{
9578
    int vlen_enc = vector_length_encoding(this, $src);
9579
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9580
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
9581
                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
9582
                                       $rtmp$$Register, true, vlen_enc);
9583
  %}
9584
  ins_pipe( pipe_slow );
9585
%}
9586

9587
instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
9588
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
9589
            !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
9590
  match(Set dst (CountLeadingZerosV src));
9591
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
9592
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
9593
  ins_encode %{
9594
    int vlen_enc = vector_length_encoding(this, $src);
9595
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9596
    __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
9597
                                      $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
9598
  %}
9599
  ins_pipe( pipe_slow );
9600
%}
9601

9602
instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
9603
  predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
9604
            !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
9605
  match(Set dst (CountLeadingZerosV src));
9606
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
9607
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
9608
  ins_encode %{
9609
    int vlen_enc = vector_length_encoding(this, $src);
9610
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
9611
    __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
9612
                                      $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
9613
  %}
9614
  ins_pipe( pipe_slow );
9615
%}
9616

9617
// ---------------------------------- Vector Masked Operations ------------------------------------

instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (AddVB (Binary dst src2) mask));
  match(Set dst (AddVS (Binary dst src2) mask));
  match(Set dst (AddVI (Binary dst src2) mask));
  match(Set dst (AddVL (Binary dst src2) mask));
  match(Set dst (AddVF (Binary dst src2) mask));
  match(Set dst (AddVD (Binary dst src2) mask));
  format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
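
// Illustrative scalar sketch (not generated code): the masked arithmetic
// patterns in this section pass true (merge) to evmasked_op, so lanes whose
// mask bit is clear keep the old destination value. For the add case above:
//
//   for (int i = 0; i < n; i++) {
//     dst[i] = mask[i] ? dst[i] + src2[i] : dst[i];
//   }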
9636

9637
instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
9638
  match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
9639
  match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
9640
  match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
9641
  match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
9642
  match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
9643
  match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
9644
  format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
9645
  ins_encode %{
9646
    int vlen_enc = vector_length_encoding(this);
9647
    BasicType bt = Matcher::vector_element_basic_type(this);
9648
    int opc = this->ideal_Opcode();
9649
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9650
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9651
  %}
9652
  ins_pipe( pipe_slow );
9653
%}
9654

9655
instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
9656
  match(Set dst (XorV (Binary dst src2) mask));
9657
  format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
9658
  ins_encode %{
9659
    int vlen_enc = vector_length_encoding(this);
9660
    BasicType bt = Matcher::vector_element_basic_type(this);
9661
    int opc = this->ideal_Opcode();
9662
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9663
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9664
  %}
9665
  ins_pipe( pipe_slow );
9666
%}
9667

9668
instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
9669
  match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
9670
  format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
9671
  ins_encode %{
9672
    int vlen_enc = vector_length_encoding(this);
9673
    BasicType bt = Matcher::vector_element_basic_type(this);
9674
    int opc = this->ideal_Opcode();
9675
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9676
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9677
  %}
9678
  ins_pipe( pipe_slow );
9679
%}
9680

9681
instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
9682
  match(Set dst (OrV (Binary dst src2) mask));
9683
  format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
9684
  ins_encode %{
9685
    int vlen_enc = vector_length_encoding(this);
9686
    BasicType bt = Matcher::vector_element_basic_type(this);
9687
    int opc = this->ideal_Opcode();
9688
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9689
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9690
  %}
9691
  ins_pipe( pipe_slow );
9692
%}
9693

9694
instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
9695
  match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
9696
  format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
9697
  ins_encode %{
9698
    int vlen_enc = vector_length_encoding(this);
9699
    BasicType bt = Matcher::vector_element_basic_type(this);
9700
    int opc = this->ideal_Opcode();
9701
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9702
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9703
  %}
9704
  ins_pipe( pipe_slow );
9705
%}
9706

9707
instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
9708
  match(Set dst (AndV (Binary dst src2) mask));
9709
  format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
9710
  ins_encode %{
9711
    int vlen_enc = vector_length_encoding(this);
9712
    BasicType bt = Matcher::vector_element_basic_type(this);
9713
    int opc = this->ideal_Opcode();
9714
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9715
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9716
  %}
9717
  ins_pipe( pipe_slow );
9718
%}
9719

9720
instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
9721
  match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
9722
  format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
9723
  ins_encode %{
9724
    int vlen_enc = vector_length_encoding(this);
9725
    BasicType bt = Matcher::vector_element_basic_type(this);
9726
    int opc = this->ideal_Opcode();
9727
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9728
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9729
  %}
9730
  ins_pipe( pipe_slow );
9731
%}
9732

9733
instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
9734
  match(Set dst (SubVB (Binary dst src2) mask));
9735
  match(Set dst (SubVS (Binary dst src2) mask));
9736
  match(Set dst (SubVI (Binary dst src2) mask));
9737
  match(Set dst (SubVL (Binary dst src2) mask));
9738
  match(Set dst (SubVF (Binary dst src2) mask));
9739
  match(Set dst (SubVD (Binary dst src2) mask));
9740
  format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
9741
  ins_encode %{
9742
    int vlen_enc = vector_length_encoding(this);
9743
    BasicType bt = Matcher::vector_element_basic_type(this);
9744
    int opc = this->ideal_Opcode();
9745
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9746
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9747
  %}
9748
  ins_pipe( pipe_slow );
9749
%}
9750

9751
instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
9752
  match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
9753
  match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
9754
  match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
9755
  match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
9756
  match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
9757
  match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
9758
  format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
9759
  ins_encode %{
9760
    int vlen_enc = vector_length_encoding(this);
9761
    BasicType bt = Matcher::vector_element_basic_type(this);
9762
    int opc = this->ideal_Opcode();
9763
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9764
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9765
  %}
9766
  ins_pipe( pipe_slow );
9767
%}
9768

9769
instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
9770
  match(Set dst (MulVS (Binary dst src2) mask));
9771
  match(Set dst (MulVI (Binary dst src2) mask));
9772
  match(Set dst (MulVL (Binary dst src2) mask));
9773
  match(Set dst (MulVF (Binary dst src2) mask));
9774
  match(Set dst (MulVD (Binary dst src2) mask));
9775
  format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
9776
  ins_encode %{
9777
    int vlen_enc = vector_length_encoding(this);
9778
    BasicType bt = Matcher::vector_element_basic_type(this);
9779
    int opc = this->ideal_Opcode();
9780
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9781
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9782
  %}
9783
  ins_pipe( pipe_slow );
9784
%}
9785

9786
instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
9787
  match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
9788
  match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
9789
  match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
9790
  match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
9791
  match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
9792
  format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
9793
  ins_encode %{
9794
    int vlen_enc = vector_length_encoding(this);
9795
    BasicType bt = Matcher::vector_element_basic_type(this);
9796
    int opc = this->ideal_Opcode();
9797
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9798
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9799
  %}
9800
  ins_pipe( pipe_slow );
9801
%}
9802

9803
instruct vsqrt_reg_masked(vec dst, kReg mask) %{
9804
  match(Set dst (SqrtVF dst mask));
9805
  match(Set dst (SqrtVD dst mask));
9806
  format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
9807
  ins_encode %{
9808
    int vlen_enc = vector_length_encoding(this);
9809
    BasicType bt = Matcher::vector_element_basic_type(this);
9810
    int opc = this->ideal_Opcode();
9811
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9812
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
9813
  %}
9814
  ins_pipe( pipe_slow );
9815
%}
9816

9817
instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
9818
  match(Set dst (DivVF (Binary dst src2) mask));
9819
  match(Set dst (DivVD (Binary dst src2) mask));
9820
  format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9821
  ins_encode %{
9822
    int vlen_enc = vector_length_encoding(this);
9823
    BasicType bt = Matcher::vector_element_basic_type(this);
9824
    int opc = this->ideal_Opcode();
9825
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9826
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9827
  %}
9828
  ins_pipe( pipe_slow );
9829
%}
9830

9831
instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
9832
  match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
9833
  match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
9834
  format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9835
  ins_encode %{
9836
    int vlen_enc = vector_length_encoding(this);
9837
    BasicType bt = Matcher::vector_element_basic_type(this);
9838
    int opc = this->ideal_Opcode();
9839
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9840
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9841
  %}
9842
  ins_pipe( pipe_slow );
9843
%}
9844

9845

9846
instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
9847
  match(Set dst (RotateLeftV (Binary dst shift) mask));
9848
  match(Set dst (RotateRightV (Binary dst shift) mask));
9849
  format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
9850
  ins_encode %{
9851
    int vlen_enc = vector_length_encoding(this);
9852
    BasicType bt = Matcher::vector_element_basic_type(this);
9853
    int opc = this->ideal_Opcode();
9854
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9855
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9856
  %}
9857
  ins_pipe( pipe_slow );
9858
%}
9859

9860
instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
9861
  match(Set dst (RotateLeftV (Binary dst src2) mask));
9862
  match(Set dst (RotateRightV (Binary dst src2) mask));
9863
  format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
9864
  ins_encode %{
9865
    int vlen_enc = vector_length_encoding(this);
9866
    BasicType bt = Matcher::vector_element_basic_type(this);
9867
    int opc = this->ideal_Opcode();
9868
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9869
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9870
  %}
9871
  ins_pipe( pipe_slow );
9872
%}
9873

9874
instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9875
  match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
9876
  match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
9877
  match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
9878
  format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
9879
  ins_encode %{
9880
    int vlen_enc = vector_length_encoding(this);
9881
    BasicType bt = Matcher::vector_element_basic_type(this);
9882
    int opc = this->ideal_Opcode();
9883
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9884
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9885
  %}
9886
  ins_pipe( pipe_slow );
9887
%}
9888

9889
instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
9890
  predicate(!n->as_ShiftV()->is_var_shift());
9891
  match(Set dst (LShiftVS (Binary dst src2) mask));
9892
  match(Set dst (LShiftVI (Binary dst src2) mask));
9893
  match(Set dst (LShiftVL (Binary dst src2) mask));
9894
  format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9895
  ins_encode %{
9896
    int vlen_enc = vector_length_encoding(this);
9897
    BasicType bt = Matcher::vector_element_basic_type(this);
9898
    int opc = this->ideal_Opcode();
9899
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9900
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9901
  %}
9902
  ins_pipe( pipe_slow );
9903
%}
9904

9905
instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9906
  predicate(n->as_ShiftV()->is_var_shift());
9907
  match(Set dst (LShiftVS (Binary dst src2) mask));
9908
  match(Set dst (LShiftVI (Binary dst src2) mask));
9909
  match(Set dst (LShiftVL (Binary dst src2) mask));
9910
  format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9911
  ins_encode %{
9912
    int vlen_enc = vector_length_encoding(this);
9913
    BasicType bt = Matcher::vector_element_basic_type(this);
9914
    int opc = this->ideal_Opcode();
9915
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9916
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9917
  %}
9918
  ins_pipe( pipe_slow );
9919
%}
9920

9921
instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9922
  match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
9923
  match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
9924
  match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
9925
  format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
9926
  ins_encode %{
9927
    int vlen_enc = vector_length_encoding(this);
9928
    BasicType bt = Matcher::vector_element_basic_type(this);
9929
    int opc = this->ideal_Opcode();
9930
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9931
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9932
  %}
9933
  ins_pipe( pipe_slow );
9934
%}
9935

9936
instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
9937
  predicate(!n->as_ShiftV()->is_var_shift());
9938
  match(Set dst (RShiftVS (Binary dst src2) mask));
9939
  match(Set dst (RShiftVI (Binary dst src2) mask));
9940
  match(Set dst (RShiftVL (Binary dst src2) mask));
9941
  format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9942
  ins_encode %{
9943
    int vlen_enc = vector_length_encoding(this);
9944
    BasicType bt = Matcher::vector_element_basic_type(this);
9945
    int opc = this->ideal_Opcode();
9946
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9947
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9948
  %}
9949
  ins_pipe( pipe_slow );
9950
%}
9951

9952
instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9953
  predicate(n->as_ShiftV()->is_var_shift());
9954
  match(Set dst (RShiftVS (Binary dst src2) mask));
9955
  match(Set dst (RShiftVI (Binary dst src2) mask));
9956
  match(Set dst (RShiftVL (Binary dst src2) mask));
9957
  format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9958
  ins_encode %{
9959
    int vlen_enc = vector_length_encoding(this);
9960
    BasicType bt = Matcher::vector_element_basic_type(this);
9961
    int opc = this->ideal_Opcode();
9962
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9963
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9964
  %}
9965
  ins_pipe( pipe_slow );
9966
%}
9967

9968
instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
  format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(!n->as_ShiftV()->is_var_shift());
  match(Set dst (URShiftVS (Binary dst src2) mask));
  match(Set dst (URShiftVI (Binary dst src2) mask));
  match(Set dst (URShiftVL (Binary dst src2) mask));
  format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
  %}
  ins_pipe( pipe_slow );
%}

instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(n->as_ShiftV()->is_var_shift());
  match(Set dst (URShiftVS (Binary dst src2) mask));
  match(Set dst (URShiftVI (Binary dst src2) mask));
  match(Set dst (URShiftVL (Binary dst src2) mask));
  format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
  %}
  ins_pipe( pipe_slow );
%}

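// Masked vector MIN/MAX in register-register and register-memory forms; the
// result is merged into $dst under the control of $mask.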
instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst src2) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MinV (Binary dst src2) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

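// Masked rearrange (VectorRearrange): permutes the elements of $dst according
// to the shuffle indices in $src2, under the control of $mask.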
instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (VectorRearrange (Binary dst src2) mask));
  format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

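// Masked absolute value for byte/short/int/long elements; $dst serves as both
// source and destination and is merged under $mask.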
instruct vabs_masked(vec dst, kReg mask) %{
  match(Set dst (AbsVB dst mask));
  match(Set dst (AbsVS dst mask));
  match(Set dst (AbsVI dst mask));
  match(Set dst (AbsVL dst mask));
  format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

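// Masked fused multiply-add for float/double vectors, with the third operand
// either in a register or loaded from memory. Requires UseFMA (asserted in
// the encoding block).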
instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

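// Masked vector compare. The result is a predicate in a k register rather
// than a vector: integral element types use evpcmp{b,w,d,q} with a signed or
// unsigned predicate derived from $cond, floating-point types use
// evcmpps/evcmppd. Only lanes enabled by $mask participate.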
instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
  match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
  format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
  ins_encode %{
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // Comparison is dispatched on the element basic type of $src1.
    switch (src1_elem_bt) {
      case T_BYTE: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_SHORT: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_INT: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_LONG: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_FLOAT: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      case T_DOUBLE: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      default: assert(false, "%s", type2name(src1_elem_bt)); break;
    }
  %}
  ins_pipe( pipe_slow );
%}

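// MaskAll: replicate a scalar condition value across every lane of an opmask
// register; this pattern covers mask lengths of at most 32.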
instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
  predicate(Matcher::vector_length(n) <= 32);
  match(Set dst (MaskAll src));
  format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
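// Mask negation, matched as XorVMask with an all-ones MaskAll (xor with all
// ones flips every lane). Mask lengths below 8 need extra temporaries; the
// other supported lengths map directly onto a knot of the matching width.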
instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
  predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
  match(Set dst (XorVMask src (MaskAll cnt)));
  effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
  predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
            (Matcher::vector_length(n) == 16) ||
            (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
  match(Set dst (XorVMask src (MaskAll cnt)));
  format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

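// VectorLongToMask: expand the low bits of a long into a vector mask. The AVX
// flavours below materialize the mask as a vector in an XMM register; the
// EVEX flavour simply moves the bits into an opmask register with kmov.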
instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, xnoreg, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}


instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1, as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    assert(mask_len <= 32, "invalid mask length");
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_mask_evex(kReg dst, rRegL src) %{
  predicate(n->bottom_type()->isa_vectmask());
  match(Set dst (VectorLongToMask src));
  format %{ "long_to_mask_evex $dst, $src\t!" %}
  ins_encode %{
    __ kmov($dst$$KRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
#endif

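// Bitwise AND/OR/XOR of two opmask registers. When the mask is shorter than
// 16 lanes and AVX512DQ (byte-granular k-register instructions) is not
// available, the operation is widened to 16 bits.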
instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
  match(Set dst (AndVMask src1 src2));
  match(Set dst (OrVMask src1 src2));
  match(Set dst (XorVMask src1 src2));
  effect(TEMP kscratch);
  format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
  ins_encode %{
    const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
    const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
    assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
    uint masklen = Matcher::vector_length(this);
    masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
    __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

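// Masked ternary logic (vpternlog): $func is the 8-bit truth table applied to
// the three vector inputs, with register and memory forms of the third
// operand.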
instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

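// CastVV is a type-only node: size(0) and an empty encoding mean no code is
// emitted, the cast merely keeps the register class (opmask, vector or legacy
// vector) visible to the register allocator.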
instruct castMM(kReg dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVV(vec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVVLeg(legVec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

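// IsInfinite via vfpclassss/vfpclasssd: the immediate 0x18 selects the
// +Infinity and -Infinity classes, and the resulting one-bit mask is copied
// into a general-purpose register.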
instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteF src));
  effect(TEMP ktmp, KILL cr);
  format %{ "float_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteD src));
  effect(TEMP ktmp, KILL cr);
  format %{ "double_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
