
Lines Matching full:next

13 ; ALL-NEXT: movswl %di, %eax
14 ; ALL-NEXT: vmovd %eax, %xmm0
15 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
16 ; ALL-NEXT: retq
25 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
26 ; ALL-NEXT: vmovq %xmm0, %rax
27 ; ALL-NEXT: movq %rax, %rcx
28 ; ALL-NEXT: movq %rax, %rdx
29 ; ALL-NEXT: movswl %ax, %esi
30 ; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
31 ; ALL-NEXT: shrl $16, %eax
32 ; ALL-NEXT: shrq $32, %rcx
33 ; ALL-NEXT: shrq $48, %rdx
34 ; ALL-NEXT: movswl %dx, %edx
35 ; ALL-NEXT: vmovd %edx, %xmm0
36 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
37 ; ALL-NEXT: movswl %cx, %ecx
38 ; ALL-NEXT: vmovd %ecx, %xmm1
39 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
40 ; ALL-NEXT: cwtl
41 ; ALL-NEXT: vmovd %eax, %xmm2
42 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
43 ; ALL-NEXT: vmovd %esi, %xmm3
44 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
45 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
46 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
47 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
48 ; ALL-NEXT: retq
57 ; ALL-NEXT: vmovq %xmm0, %rax
58 ; ALL-NEXT: movq %rax, %rcx
59 ; ALL-NEXT: movq %rax, %rdx
60 ; ALL-NEXT: movswl %ax, %esi
61 ; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
62 ; ALL-NEXT: shrl $16, %eax
63 ; ALL-NEXT: shrq $32, %rcx
64 ; ALL-NEXT: shrq $48, %rdx
65 ; ALL-NEXT: movswl %dx, %edx
66 ; ALL-NEXT: vmovd %edx, %xmm0
67 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
68 ; ALL-NEXT: movswl %cx, %ecx
69 ; ALL-NEXT: vmovd %ecx, %xmm1
70 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
71 ; ALL-NEXT: cwtl
72 ; ALL-NEXT: vmovd %eax, %xmm2
73 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
74 ; ALL-NEXT: vmovd %esi, %xmm3
75 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
76 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
77 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
78 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
79 ; ALL-NEXT: retq
89 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
90 ; AVX1-NEXT: movq %rdx, %r8
91 ; AVX1-NEXT: movq %rdx, %r10
92 ; AVX1-NEXT: movswl %dx, %r9d
93 ; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
94 ; AVX1-NEXT: shrl $16, %edx
95 ; AVX1-NEXT: shrq $32, %r8
96 ; AVX1-NEXT: shrq $48, %r10
97 ; AVX1-NEXT: vmovq %xmm0, %rdi
98 ; AVX1-NEXT: movq %rdi, %rax
99 ; AVX1-NEXT: movq %rdi, %rsi
100 ; AVX1-NEXT: movswl %di, %ecx
101 ; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
102 ; AVX1-NEXT: shrl $16, %edi
103 ; AVX1-NEXT: shrq $32, %rax
104 ; AVX1-NEXT: shrq $48, %rsi
105 ; AVX1-NEXT: movswl %si, %esi
106 ; AVX1-NEXT: vmovd %esi, %xmm0
107 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
108 ; AVX1-NEXT: cwtl
109 ; AVX1-NEXT: vmovd %eax, %xmm1
110 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
111 ; AVX1-NEXT: movswl %di, %eax
112 ; AVX1-NEXT: vmovd %eax, %xmm2
113 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
114 ; AVX1-NEXT: vmovd %ecx, %xmm3
115 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
116 ; AVX1-NEXT: movswl %r10w, %eax
117 ; AVX1-NEXT: vmovd %eax, %xmm4
118 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
119 ; AVX1-NEXT: movswl %r8w, %eax
120 ; AVX1-NEXT: vmovd %eax, %xmm5
121 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
122 ; AVX1-NEXT: movswl %dx, %eax
123 ; AVX1-NEXT: vmovd %eax, %xmm6
124 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
125 ; AVX1-NEXT: vmovd %r9d, %xmm7
126 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
127 ; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
128 ; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
129 ; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
130 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
131 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
132 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
133 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
134 ; AVX1-NEXT: retq
138 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
139 ; AVX2-NEXT: movq %rdx, %r8
140 ; AVX2-NEXT: movq %rdx, %r10
141 ; AVX2-NEXT: movswl %dx, %r9d
142 ; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
143 ; AVX2-NEXT: shrl $16, %edx
144 ; AVX2-NEXT: shrq $32, %r8
145 ; AVX2-NEXT: shrq $48, %r10
146 ; AVX2-NEXT: vmovq %xmm0, %rdi
147 ; AVX2-NEXT: movq %rdi, %rax
148 ; AVX2-NEXT: movq %rdi, %rsi
149 ; AVX2-NEXT: movswl %di, %ecx
150 ; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
151 ; AVX2-NEXT: shrl $16, %edi
152 ; AVX2-NEXT: shrq $32, %rax
153 ; AVX2-NEXT: shrq $48, %rsi
154 ; AVX2-NEXT: movswl %si, %esi
155 ; AVX2-NEXT: vmovd %esi, %xmm0
156 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
157 ; AVX2-NEXT: cwtl
158 ; AVX2-NEXT: vmovd %eax, %xmm1
159 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
160 ; AVX2-NEXT: movswl %di, %eax
161 ; AVX2-NEXT: vmovd %eax, %xmm2
162 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
163 ; AVX2-NEXT: vmovd %ecx, %xmm3
164 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
165 ; AVX2-NEXT: movswl %r10w, %eax
166 ; AVX2-NEXT: vmovd %eax, %xmm4
167 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
168 ; AVX2-NEXT: movswl %r8w, %eax
169 ; AVX2-NEXT: vmovd %eax, %xmm5
170 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
171 ; AVX2-NEXT: movswl %dx, %eax
172 ; AVX2-NEXT: vmovd %eax, %xmm6
173 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
174 ; AVX2-NEXT: vmovd %r9d, %xmm7
175 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
176 ; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
177 ; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
178 ; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
179 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
180 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
181 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
182 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
183 ; AVX2-NEXT: retq
187 ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
188 ; AVX512-NEXT: movq %rdx, %r8
189 ; AVX512-NEXT: movq %rdx, %r10
190 ; AVX512-NEXT: movswl %dx, %r9d
191 ; AVX512-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
192 ; AVX512-NEXT: shrl $16, %edx
193 ; AVX512-NEXT: shrq $32, %r8
194 ; AVX512-NEXT: shrq $48, %r10
195 ; AVX512-NEXT: vmovq %xmm0, %rdi
196 ; AVX512-NEXT: movq %rdi, %rax
197 ; AVX512-NEXT: movq %rdi, %rsi
198 ; AVX512-NEXT: movswl %di, %ecx
199 ; AVX512-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
200 ; AVX512-NEXT: shrl $16, %edi
201 ; AVX512-NEXT: shrq $32, %rax
202 ; AVX512-NEXT: shrq $48, %rsi
203 ; AVX512-NEXT: movswl %si, %esi
204 ; AVX512-NEXT: vmovd %esi, %xmm0
205 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
206 ; AVX512-NEXT: cwtl
207 ; AVX512-NEXT: vmovd %eax, %xmm1
208 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
209 ; AVX512-NEXT: movswl %di, %eax
210 ; AVX512-NEXT: vmovd %eax, %xmm2
211 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
212 ; AVX512-NEXT: vmovd %ecx, %xmm3
213 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
214 ; AVX512-NEXT: movswl %r10w, %eax
215 ; AVX512-NEXT: vmovd %eax, %xmm4
216 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
217 ; AVX512-NEXT: movswl %r8w, %eax
218 ; AVX512-NEXT: vmovd %eax, %xmm5
219 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
220 ; AVX512-NEXT: movswl %dx, %eax
221 ; AVX512-NEXT: vmovd %eax, %xmm6
222 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
223 ; AVX512-NEXT: vmovd %r9d, %xmm7
224 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
225 ; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
226 ; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
227 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
228 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
229 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
230 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
231 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
232 ; AVX512-NEXT: retq
241 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
242 ; AVX1-NEXT: vmovq %xmm4, %rax
243 ; AVX1-NEXT: movq %rax, %rcx
244 ; AVX1-NEXT: shrq $48, %rcx
245 ; AVX1-NEXT: movswl %cx, %ecx
246 ; AVX1-NEXT: vmovd %ecx, %xmm8
247 ; AVX1-NEXT: movq %rax, %rcx
248 ; AVX1-NEXT: shrq $32, %rcx
249 ; AVX1-NEXT: movswl %cx, %ecx
250 ; AVX1-NEXT: vmovd %ecx, %xmm9
251 ; AVX1-NEXT: movswl %ax, %ecx
252 ; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
253 ; AVX1-NEXT: shrl $16, %eax
254 ; AVX1-NEXT: cwtl
255 ; AVX1-NEXT: vmovd %eax, %xmm10
256 ; AVX1-NEXT: vpextrq $1, %xmm4, %rax
257 ; AVX1-NEXT: vmovd %ecx, %xmm11
258 ; AVX1-NEXT: movq %rax, %rcx
259 ; AVX1-NEXT: shrq $48, %rcx
260 ; AVX1-NEXT: movswl %cx, %ecx
261 ; AVX1-NEXT: vmovd %ecx, %xmm12
262 ; AVX1-NEXT: movq %rax, %rcx
263 ; AVX1-NEXT: shrq $32, %rcx
264 ; AVX1-NEXT: movswl %cx, %ecx
265 ; AVX1-NEXT: vmovd %ecx, %xmm13
266 ; AVX1-NEXT: movswl %ax, %ecx
267 ; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
268 ; AVX1-NEXT: shrl $16, %eax
269 ; AVX1-NEXT: cwtl
270 ; AVX1-NEXT: vmovd %eax, %xmm14
271 ; AVX1-NEXT: vmovq %xmm0, %rax
272 ; AVX1-NEXT: vmovd %ecx, %xmm15
273 ; AVX1-NEXT: movq %rax, %rcx
274 ; AVX1-NEXT: shrq $48, %rcx
275 ; AVX1-NEXT: movswl %cx, %ecx
276 ; AVX1-NEXT: vmovd %ecx, %xmm2
277 ; AVX1-NEXT: movq %rax, %rcx
278 ; AVX1-NEXT: shrq $32, %rcx
279 ; AVX1-NEXT: movswl %cx, %ecx
280 ; AVX1-NEXT: vmovd %ecx, %xmm3
281 ; AVX1-NEXT: movswl %ax, %ecx
282 ; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
283 ; AVX1-NEXT: shrl $16, %eax
284 ; AVX1-NEXT: cwtl
285 ; AVX1-NEXT: vmovd %eax, %xmm4
286 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
287 ; AVX1-NEXT: vmovd %ecx, %xmm0
288 ; AVX1-NEXT: movq %rax, %rcx
289 ; AVX1-NEXT: shrq $48, %rcx
290 ; AVX1-NEXT: movswl %cx, %ecx
291 ; AVX1-NEXT: vmovd %ecx, %xmm5
292 ; AVX1-NEXT: movq %rax, %rcx
293 ; AVX1-NEXT: shrq $32, %rcx
294 ; AVX1-NEXT: movswl %cx, %ecx
295 ; AVX1-NEXT: vmovd %ecx, %xmm6
296 ; AVX1-NEXT: movl %eax, %ecx
297 ; AVX1-NEXT: shrl $16, %ecx
298 ; AVX1-NEXT: movswl %cx, %ecx
299 ; AVX1-NEXT: vmovd %ecx, %xmm7
300 ; AVX1-NEXT: cwtl
301 ; AVX1-NEXT: vmovd %eax, %xmm1
302 ; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8
303 ; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9
304 ; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10
305 ; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11
306 ; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12
307 ; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13
308 ; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14
309 ; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15
310 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
311 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
312 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
313 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
314 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
315 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
316 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
317 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
318 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
319 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
320 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
321 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
322 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
323 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
324 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
325 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
326 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
327 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
328 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
329 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
330 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
331 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
332 ; AVX1-NEXT: retq
336 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
337 ; AVX2-NEXT: vmovq %xmm4, %rax
338 ; AVX2-NEXT: movq %rax, %rcx
339 ; AVX2-NEXT: shrq $48, %rcx
340 ; AVX2-NEXT: movswl %cx, %ecx
341 ; AVX2-NEXT: vmovd %ecx, %xmm8
342 ; AVX2-NEXT: movq %rax, %rcx
343 ; AVX2-NEXT: shrq $32, %rcx
344 ; AVX2-NEXT: movswl %cx, %ecx
345 ; AVX2-NEXT: vmovd %ecx, %xmm9
346 ; AVX2-NEXT: movswl %ax, %ecx
347 ; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
348 ; AVX2-NEXT: shrl $16, %eax
349 ; AVX2-NEXT: cwtl
350 ; AVX2-NEXT: vmovd %eax, %xmm10
351 ; AVX2-NEXT: vpextrq $1, %xmm4, %rax
352 ; AVX2-NEXT: vmovd %ecx, %xmm11
353 ; AVX2-NEXT: movq %rax, %rcx
354 ; AVX2-NEXT: shrq $48, %rcx
355 ; AVX2-NEXT: movswl %cx, %ecx
356 ; AVX2-NEXT: vmovd %ecx, %xmm12
357 ; AVX2-NEXT: movq %rax, %rcx
358 ; AVX2-NEXT: shrq $32, %rcx
359 ; AVX2-NEXT: movswl %cx, %ecx
360 ; AVX2-NEXT: vmovd %ecx, %xmm13
361 ; AVX2-NEXT: movswl %ax, %ecx
362 ; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
363 ; AVX2-NEXT: shrl $16, %eax
364 ; AVX2-NEXT: cwtl
365 ; AVX2-NEXT: vmovd %eax, %xmm14
366 ; AVX2-NEXT: vmovq %xmm0, %rax
367 ; AVX2-NEXT: vmovd %ecx, %xmm15
368 ; AVX2-NEXT: movq %rax, %rcx
369 ; AVX2-NEXT: shrq $48, %rcx
370 ; AVX2-NEXT: movswl %cx, %ecx
371 ; AVX2-NEXT: vmovd %ecx, %xmm2
372 ; AVX2-NEXT: movq %rax, %rcx
373 ; AVX2-NEXT: shrq $32, %rcx
374 ; AVX2-NEXT: movswl %cx, %ecx
375 ; AVX2-NEXT: vmovd %ecx, %xmm3
376 ; AVX2-NEXT: movswl %ax, %ecx
377 ; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
378 ; AVX2-NEXT: shrl $16, %eax
379 ; AVX2-NEXT: cwtl
380 ; AVX2-NEXT: vmovd %eax, %xmm4
381 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
382 ; AVX2-NEXT: vmovd %ecx, %xmm0
383 ; AVX2-NEXT: movq %rax, %rcx
384 ; AVX2-NEXT: shrq $48, %rcx
385 ; AVX2-NEXT: movswl %cx, %ecx
386 ; AVX2-NEXT: vmovd %ecx, %xmm5
387 ; AVX2-NEXT: movq %rax, %rcx
388 ; AVX2-NEXT: shrq $32, %rcx
389 ; AVX2-NEXT: movswl %cx, %ecx
390 ; AVX2-NEXT: vmovd %ecx, %xmm6
391 ; AVX2-NEXT: movl %eax, %ecx
392 ; AVX2-NEXT: shrl $16, %ecx
393 ; AVX2-NEXT: movswl %cx, %ecx
394 ; AVX2-NEXT: vmovd %ecx, %xmm7
395 ; AVX2-NEXT: cwtl
396 ; AVX2-NEXT: vmovd %eax, %xmm1
397 ; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8
398 ; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9
399 ; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10
400 ; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11
401 ; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12
402 ; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13
403 ; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14
404 ; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15
405 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
406 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
407 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
408 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
409 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
410 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
411 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
412 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
413 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
414 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
415 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
416 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
417 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
418 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
419 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
420 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
421 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
422 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
423 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
424 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
425 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
426 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
427 ; AVX2-NEXT: retq
431 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10
432 ; AVX512-NEXT: vmovq %xmm0, %rax
433 ; AVX512-NEXT: movq %rax, %rcx
434 ; AVX512-NEXT: shrq $48, %rcx
435 ; AVX512-NEXT: movswl %cx, %ecx
436 ; AVX512-NEXT: vmovd %ecx, %xmm8
437 ; AVX512-NEXT: movq %rax, %rcx
438 ; AVX512-NEXT: shrq $32, %rcx
439 ; AVX512-NEXT: movswl %cx, %ecx
440 ; AVX512-NEXT: vmovd %ecx, %xmm9
441 ; AVX512-NEXT: movswl %ax, %ecx
442 ; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
443 ; AVX512-NEXT: shrl $16, %eax
444 ; AVX512-NEXT: cwtl
445 ; AVX512-NEXT: vmovd %eax, %xmm11
446 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
447 ; AVX512-NEXT: vmovd %ecx, %xmm12
448 ; AVX512-NEXT: movq %rax, %rcx
449 ; AVX512-NEXT: shrq $48, %rcx
450 ; AVX512-NEXT: movswl %cx, %ecx
451 ; AVX512-NEXT: vmovd %ecx, %xmm13
452 ; AVX512-NEXT: movq %rax, %rcx
453 ; AVX512-NEXT: shrq $32, %rcx
454 ; AVX512-NEXT: movswl %cx, %ecx
455 ; AVX512-NEXT: vmovd %ecx, %xmm14
456 ; AVX512-NEXT: movswl %ax, %ecx
457 ; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
458 ; AVX512-NEXT: shrl $16, %eax
459 ; AVX512-NEXT: cwtl
460 ; AVX512-NEXT: vmovd %eax, %xmm15
461 ; AVX512-NEXT: vmovq %xmm10, %rax
462 ; AVX512-NEXT: vmovd %ecx, %xmm2
463 ; AVX512-NEXT: movq %rax, %rcx
464 ; AVX512-NEXT: shrq $48, %rcx
465 ; AVX512-NEXT: movswl %cx, %ecx
466 ; AVX512-NEXT: vmovd %ecx, %xmm3
467 ; AVX512-NEXT: movq %rax, %rcx
468 ; AVX512-NEXT: shrq $32, %rcx
469 ; AVX512-NEXT: movswl %cx, %ecx
470 ; AVX512-NEXT: vmovd %ecx, %xmm1
471 ; AVX512-NEXT: movswl %ax, %ecx
472 ; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
473 ; AVX512-NEXT: shrl $16, %eax
474 ; AVX512-NEXT: cwtl
475 ; AVX512-NEXT: vmovd %eax, %xmm4
476 ; AVX512-NEXT: vpextrq $1, %xmm10, %rax
477 ; AVX512-NEXT: vmovd %ecx, %xmm10
478 ; AVX512-NEXT: movq %rax, %rcx
479 ; AVX512-NEXT: shrq $48, %rcx
480 ; AVX512-NEXT: movswl %cx, %ecx
481 ; AVX512-NEXT: vmovd %ecx, %xmm5
482 ; AVX512-NEXT: movq %rax, %rcx
483 ; AVX512-NEXT: shrq $32, %rcx
484 ; AVX512-NEXT: movswl %cx, %ecx
485 ; AVX512-NEXT: vmovd %ecx, %xmm6
486 ; AVX512-NEXT: movl %eax, %ecx
487 ; AVX512-NEXT: shrl $16, %ecx
488 ; AVX512-NEXT: movswl %cx, %ecx
489 ; AVX512-NEXT: vmovd %ecx, %xmm7
490 ; AVX512-NEXT: cwtl
491 ; AVX512-NEXT: vmovd %eax, %xmm0
492 ; AVX512-NEXT: vcvtph2ps %xmm8, %xmm8
493 ; AVX512-NEXT: vcvtph2ps %xmm9, %xmm9
494 ; AVX512-NEXT: vcvtph2ps %xmm11, %xmm11
495 ; AVX512-NEXT: vcvtph2ps %xmm12, %xmm12
496 ; AVX512-NEXT: vcvtph2ps %xmm13, %xmm13
497 ; AVX512-NEXT: vcvtph2ps %xmm14, %xmm14
498 ; AVX512-NEXT: vcvtph2ps %xmm15, %xmm15
499 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
500 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
501 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
502 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
503 ; AVX512-NEXT: vcvtph2ps %xmm10, %xmm10
504 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
505 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
506 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
507 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
508 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
509 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
510 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
511 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
512 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
513 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
514 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
515 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
516 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
517 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
518 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
519 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
520 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
521 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
522 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
523 ; AVX512-NEXT: retq
536 ; ALL-NEXT: movswl (%rdi), %eax
537 ; ALL-NEXT: vmovd %eax, %xmm0
538 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
539 ; ALL-NEXT: retq
549 ; ALL-NEXT: movswl 6(%rdi), %eax
550 ; ALL-NEXT: vmovd %eax, %xmm0
551 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
552 ; ALL-NEXT: movswl 4(%rdi), %eax
553 ; ALL-NEXT: vmovd %eax, %xmm1
554 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
555 ; ALL-NEXT: movswl (%rdi), %eax
556 ; ALL-NEXT: vmovd %eax, %xmm2
557 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
558 ; ALL-NEXT: movswl 2(%rdi), %eax
559 ; ALL-NEXT: vmovd %eax, %xmm3
560 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
561 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
562 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
563 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
564 ; ALL-NEXT: retq
574 ; ALL-NEXT: movq (%rdi), %rax
575 ; ALL-NEXT: movq %rax, %rcx
576 ; ALL-NEXT: movq %rax, %rdx
577 ; ALL-NEXT: movswl %ax, %esi
578 ; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
579 ; ALL-NEXT: shrl $16, %eax
580 ; ALL-NEXT: shrq $32, %rcx
581 ; ALL-NEXT: shrq $48, %rdx
582 ; ALL-NEXT: movswl %dx, %edx
583 ; ALL-NEXT: vmovd %edx, %xmm0
584 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
585 ; ALL-NEXT: movswl %cx, %ecx
586 ; ALL-NEXT: vmovd %ecx, %xmm1
587 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
588 ; ALL-NEXT: cwtl
589 ; ALL-NEXT: vmovd %eax, %xmm2
590 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
591 ; ALL-NEXT: vmovd %esi, %xmm3
592 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
593 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
594 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
595 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
596 ; ALL-NEXT: retq
607 ; AVX1-NEXT: movswl 6(%rdi), %eax
608 ; AVX1-NEXT: vmovd %eax, %xmm0
609 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
610 ; AVX1-NEXT: movswl 4(%rdi), %eax
611 ; AVX1-NEXT: vmovd %eax, %xmm1
612 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
613 ; AVX1-NEXT: movswl (%rdi), %eax
614 ; AVX1-NEXT: vmovd %eax, %xmm2
615 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
616 ; AVX1-NEXT: movswl 2(%rdi), %eax
617 ; AVX1-NEXT: vmovd %eax, %xmm3
618 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
619 ; AVX1-NEXT: movswl 14(%rdi), %eax
620 ; AVX1-NEXT: vmovd %eax, %xmm4
621 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
622 ; AVX1-NEXT: movswl 12(%rdi), %eax
623 ; AVX1-NEXT: vmovd %eax, %xmm5
624 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
625 ; AVX1-NEXT: movswl 8(%rdi), %eax
626 ; AVX1-NEXT: vmovd %eax, %xmm6
627 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
628 ; AVX1-NEXT: movswl 10(%rdi), %eax
629 ; AVX1-NEXT: vmovd %eax, %xmm7
630 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
631 ; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
632 ; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
633 ; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
634 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
635 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
636 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
637 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
638 ; AVX1-NEXT: retq
642 ; AVX2-NEXT: movswl 6(%rdi), %eax
643 ; AVX2-NEXT: vmovd %eax, %xmm0
644 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
645 ; AVX2-NEXT: movswl 4(%rdi), %eax
646 ; AVX2-NEXT: vmovd %eax, %xmm1
647 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
648 ; AVX2-NEXT: movswl (%rdi), %eax
649 ; AVX2-NEXT: vmovd %eax, %xmm2
650 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
651 ; AVX2-NEXT: movswl 2(%rdi), %eax
652 ; AVX2-NEXT: vmovd %eax, %xmm3
653 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
654 ; AVX2-NEXT: movswl 14(%rdi), %eax
655 ; AVX2-NEXT: vmovd %eax, %xmm4
656 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
657 ; AVX2-NEXT: movswl 12(%rdi), %eax
658 ; AVX2-NEXT: vmovd %eax, %xmm5
659 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
660 ; AVX2-NEXT: movswl 8(%rdi), %eax
661 ; AVX2-NEXT: vmovd %eax, %xmm6
662 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
663 ; AVX2-NEXT: movswl 10(%rdi), %eax
664 ; AVX2-NEXT: vmovd %eax, %xmm7
665 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
666 ; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
667 ; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
668 ; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
669 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
670 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
671 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
672 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
673 ; AVX2-NEXT: retq
677 ; AVX512-NEXT: movswl 6(%rdi), %eax
678 ; AVX512-NEXT: vmovd %eax, %xmm0
679 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
680 ; AVX512-NEXT: movswl 4(%rdi), %eax
681 ; AVX512-NEXT: vmovd %eax, %xmm1
682 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
683 ; AVX512-NEXT: movswl (%rdi), %eax
684 ; AVX512-NEXT: vmovd %eax, %xmm2
685 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
686 ; AVX512-NEXT: movswl 2(%rdi), %eax
687 ; AVX512-NEXT: vmovd %eax, %xmm3
688 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
689 ; AVX512-NEXT: movswl 14(%rdi), %eax
690 ; AVX512-NEXT: vmovd %eax, %xmm4
691 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
692 ; AVX512-NEXT: movswl 12(%rdi), %eax
693 ; AVX512-NEXT: vmovd %eax, %xmm5
694 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
695 ; AVX512-NEXT: movswl 8(%rdi), %eax
696 ; AVX512-NEXT: vmovd %eax, %xmm6
697 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
698 ; AVX512-NEXT: movswl 10(%rdi), %eax
699 ; AVX512-NEXT: vmovd %eax, %xmm7
700 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
701 ; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
702 ; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
703 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
704 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
705 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
706 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
707 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
708 ; AVX512-NEXT: retq
718 ; AVX1-NEXT: movswl 22(%rdi), %eax
719 ; AVX1-NEXT: vmovd %eax, %xmm0
720 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8
721 ; AVX1-NEXT: movswl 20(%rdi), %eax
722 ; AVX1-NEXT: vmovd %eax, %xmm0
723 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9
724 ; AVX1-NEXT: movswl 16(%rdi), %eax
725 ; AVX1-NEXT: vmovd %eax, %xmm0
726 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10
727 ; AVX1-NEXT: movswl 18(%rdi), %eax
728 ; AVX1-NEXT: vmovd %eax, %xmm0
729 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11
730 ; AVX1-NEXT: movswl 30(%rdi), %eax
731 ; AVX1-NEXT: vmovd %eax, %xmm0
732 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12
733 ; AVX1-NEXT: movswl 28(%rdi), %eax
734 ; AVX1-NEXT: vmovd %eax, %xmm0
735 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13
736 ; AVX1-NEXT: movswl 24(%rdi), %eax
737 ; AVX1-NEXT: vmovd %eax, %xmm0
738 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14
739 ; AVX1-NEXT: movswl 26(%rdi), %eax
740 ; AVX1-NEXT: vmovd %eax, %xmm0
741 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15
742 ; AVX1-NEXT: movswl 6(%rdi), %eax
743 ; AVX1-NEXT: vmovd %eax, %xmm0
744 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
745 ; AVX1-NEXT: movswl 4(%rdi), %eax
746 ; AVX1-NEXT: vmovd %eax, %xmm2
747 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
748 ; AVX1-NEXT: movswl (%rdi), %eax
749 ; AVX1-NEXT: vmovd %eax, %xmm3
750 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
751 ; AVX1-NEXT: movswl 2(%rdi), %eax
752 ; AVX1-NEXT: vmovd %eax, %xmm4
753 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
754 ; AVX1-NEXT: movswl 14(%rdi), %eax
755 ; AVX1-NEXT: vmovd %eax, %xmm5
756 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
757 ; AVX1-NEXT: movswl 12(%rdi), %eax
758 ; AVX1-NEXT: vmovd %eax, %xmm6
759 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
760 ; AVX1-NEXT: movswl 8(%rdi), %eax
761 ; AVX1-NEXT: vmovd %eax, %xmm7
762 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
763 ; AVX1-NEXT: movswl 10(%rdi), %eax
764 ; AVX1-NEXT: vmovd %eax, %xmm1
765 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
766 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
767 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
768 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
769 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
770 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
771 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
772 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
773 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
774 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
775 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
776 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
777 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
778 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
779 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
780 ; AVX1-NEXT: retq
784 ; AVX2-NEXT: movswl 22(%rdi), %eax
785 ; AVX2-NEXT: vmovd %eax, %xmm0
786 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8
787 ; AVX2-NEXT: movswl 20(%rdi), %eax
788 ; AVX2-NEXT: vmovd %eax, %xmm0
789 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9
790 ; AVX2-NEXT: movswl 16(%rdi), %eax
791 ; AVX2-NEXT: vmovd %eax, %xmm0
792 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10
793 ; AVX2-NEXT: movswl 18(%rdi), %eax
794 ; AVX2-NEXT: vmovd %eax, %xmm0
795 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11
796 ; AVX2-NEXT: movswl 30(%rdi), %eax
797 ; AVX2-NEXT: vmovd %eax, %xmm0
798 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12
799 ; AVX2-NEXT: movswl 28(%rdi), %eax
800 ; AVX2-NEXT: vmovd %eax, %xmm0
801 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13
802 ; AVX2-NEXT: movswl 24(%rdi), %eax
803 ; AVX2-NEXT: vmovd %eax, %xmm0
804 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14
805 ; AVX2-NEXT: movswl 26(%rdi), %eax
806 ; AVX2-NEXT: vmovd %eax, %xmm0
807 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15
808 ; AVX2-NEXT: movswl 6(%rdi), %eax
809 ; AVX2-NEXT: vmovd %eax, %xmm0
810 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
811 ; AVX2-NEXT: movswl 4(%rdi), %eax
812 ; AVX2-NEXT: vmovd %eax, %xmm2
813 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
814 ; AVX2-NEXT: movswl (%rdi), %eax
815 ; AVX2-NEXT: vmovd %eax, %xmm3
816 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
817 ; AVX2-NEXT: movswl 2(%rdi), %eax
818 ; AVX2-NEXT: vmovd %eax, %xmm4
819 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
820 ; AVX2-NEXT: movswl 14(%rdi), %eax
821 ; AVX2-NEXT: vmovd %eax, %xmm5
822 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
823 ; AVX2-NEXT: movswl 12(%rdi), %eax
824 ; AVX2-NEXT: vmovd %eax, %xmm6
825 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
826 ; AVX2-NEXT: movswl 8(%rdi), %eax
827 ; AVX2-NEXT: vmovd %eax, %xmm7
828 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
829 ; AVX2-NEXT: movswl 10(%rdi), %eax
830 ; AVX2-NEXT: vmovd %eax, %xmm1
831 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
832 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
833 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
834 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
835 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
836 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
837 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
838 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
839 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
840 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
841 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
842 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
843 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
844 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
845 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
846 ; AVX2-NEXT: retq
850 ; AVX512-NEXT: movswl 6(%rdi), %eax
851 ; AVX512-NEXT: vmovd %eax, %xmm0
852 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm8
853 ; AVX512-NEXT: movswl 4(%rdi), %eax
854 ; AVX512-NEXT: vmovd %eax, %xmm0
855 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm9
856 ; AVX512-NEXT: movswl (%rdi), %eax
857 ; AVX512-NEXT: vmovd %eax, %xmm0
858 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm10
859 ; AVX512-NEXT: movswl 2(%rdi), %eax
860 ; AVX512-NEXT: vmovd %eax, %xmm0
861 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm11
862 ; AVX512-NEXT: movswl 14(%rdi), %eax
863 ; AVX512-NEXT: vmovd %eax, %xmm0
864 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm12
865 ; AVX512-NEXT: movswl 12(%rdi), %eax
866 ; AVX512-NEXT: vmovd %eax, %xmm0
867 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm13
868 ; AVX512-NEXT: movswl 8(%rdi), %eax
869 ; AVX512-NEXT: vmovd %eax, %xmm0
870 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm14
871 ; AVX512-NEXT: movswl 10(%rdi), %eax
872 ; AVX512-NEXT: vmovd %eax, %xmm0
873 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm15
874 ; AVX512-NEXT: movswl 22(%rdi), %eax
875 ; AVX512-NEXT: vmovd %eax, %xmm0
876 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
877 ; AVX512-NEXT: movswl 20(%rdi), %eax
878 ; AVX512-NEXT: vmovd %eax, %xmm1
879 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
880 ; AVX512-NEXT: movswl 16(%rdi), %eax
881 ; AVX512-NEXT: vmovd %eax, %xmm2
882 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
883 ; AVX512-NEXT: movswl 18(%rdi), %eax
884 ; AVX512-NEXT: vmovd %eax, %xmm3
885 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
886 ; AVX512-NEXT: movswl 30(%rdi), %eax
887 ; AVX512-NEXT: vmovd %eax, %xmm4
888 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
889 ; AVX512-NEXT: movswl 28(%rdi), %eax
890 ; AVX512-NEXT: vmovd %eax, %xmm5
891 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
892 ; AVX512-NEXT: movswl 24(%rdi), %eax
893 ; AVX512-NEXT: vmovd %eax, %xmm6
894 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
895 ; AVX512-NEXT: movswl 26(%rdi), %eax
896 ; AVX512-NEXT: vmovd %eax, %xmm7
897 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
898 ; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
899 ; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
900 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
901 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
902 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
903 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
904 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
905 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
906 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
907 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
908 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
909 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
910 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
911 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
912 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
913 ; AVX512-NEXT: retq
927 ; ALL-NEXT: movswl %di, %eax
928 ; ALL-NEXT: vmovd %eax, %xmm0
929 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
930 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
931 ; ALL-NEXT: retq
940 ; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
941 ; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
942 ; ALL-NEXT: vmovd %xmm0, %eax
943 ; ALL-NEXT: movswl %ax, %ecx
944 ; ALL-NEXT: shrl $16, %eax
945 ; ALL-NEXT: cwtl
946 ; ALL-NEXT: vmovd %eax, %xmm0
947 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
948 ; ALL-NEXT: vmovd %ecx, %xmm1
949 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
950 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
951 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
952 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
953 ; ALL-NEXT: retq
962 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
963 ; ALL-NEXT: vmovq %xmm0, %rax
964 ; ALL-NEXT: movq %rax, %rcx
965 ; ALL-NEXT: movl %eax, %edx
966 ; ALL-NEXT: movswl %ax, %esi
967 ; ALL-NEXT: shrq $48, %rax
968 ; ALL-NEXT: shrq $32, %rcx
969 ; ALL-NEXT: shrl $16, %edx
970 ; ALL-NEXT: movswl %dx, %edx
971 ; ALL-NEXT: vmovd %edx, %xmm0
972 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
973 ; ALL-NEXT: vmovd %esi, %xmm1
974 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
975 ; ALL-NEXT: movswl %cx, %ecx
976 ; ALL-NEXT: vmovd %ecx, %xmm2
977 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
978 ; ALL-NEXT: cwtl
979 ; ALL-NEXT: vmovd %eax, %xmm3
980 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
981 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
982 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
983 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
984 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
985 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
986 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
987 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
988 ; ALL-NEXT: retq
997 ; ALL-NEXT: vmovd %xmm0, %eax
998 ; ALL-NEXT: movswl %ax, %ecx
999 ; ALL-NEXT: shrl $16, %eax
1000 ; ALL-NEXT: cwtl
1001 ; ALL-NEXT: vmovd %eax, %xmm0
1002 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1003 ; ALL-NEXT: vmovd %ecx, %xmm1
1004 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1005 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1006 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1007 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1008 ; ALL-NEXT: retq
1018 ; ALL-NEXT: vmovq %xmm0, %rax
1019 ; ALL-NEXT: movq %rax, %rcx
1020 ; ALL-NEXT: movl %eax, %edx
1021 ; ALL-NEXT: movswl %ax, %esi
1022 ; ALL-NEXT: shrq $48, %rax
1023 ; ALL-NEXT: shrq $32, %rcx
1024 ; ALL-NEXT: shrl $16, %edx
1025 ; ALL-NEXT: movswl %dx, %edx
1026 ; ALL-NEXT: vmovd %edx, %xmm0
1027 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1028 ; ALL-NEXT: vmovd %esi, %xmm1
1029 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1030 ; ALL-NEXT: movswl %cx, %ecx
1031 ; ALL-NEXT: vmovd %ecx, %xmm2
1032 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
1033 ; ALL-NEXT: cwtl
1034 ; ALL-NEXT: vmovd %eax, %xmm3
1035 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
1036 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1037 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1038 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1039 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1040 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1041 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1042 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1043 ; ALL-NEXT: retq
1053 ; AVX1-NEXT: vmovq %xmm0, %rdx
1054 ; AVX1-NEXT: movq %rdx, %r9
1055 ; AVX1-NEXT: movl %edx, %r10d
1056 ; AVX1-NEXT: movswl %dx, %r8d
1057 ; AVX1-NEXT: shrq $48, %rdx
1058 ; AVX1-NEXT: shrq $32, %r9
1059 ; AVX1-NEXT: shrl $16, %r10d
1060 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
1061 ; AVX1-NEXT: movq %rdi, %rsi
1062 ; AVX1-NEXT: movl %edi, %eax
1063 ; AVX1-NEXT: movswl %di, %ecx
1064 ; AVX1-NEXT: shrq $48, %rdi
1065 ; AVX1-NEXT: shrq $32, %rsi
1066 ; AVX1-NEXT: shrl $16, %eax
1067 ; AVX1-NEXT: cwtl
1068 ; AVX1-NEXT: vmovd %eax, %xmm0
1069 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
1070 ; AVX1-NEXT: vmovd %ecx, %xmm0
1071 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
1072 ; AVX1-NEXT: movswl %si, %eax
1073 ; AVX1-NEXT: vmovd %eax, %xmm0
1074 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
1075 ; AVX1-NEXT: movswl %di, %eax
1076 ; AVX1-NEXT: vmovd %eax, %xmm0
1077 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
1078 ; AVX1-NEXT: movswl %r10w, %eax
1079 ; AVX1-NEXT: vmovd %eax, %xmm0
1080 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
1081 ; AVX1-NEXT: vmovd %r8d, %xmm5
1082 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
1083 ; AVX1-NEXT: movswl %r9w, %eax
1084 ; AVX1-NEXT: vmovd %eax, %xmm6
1085 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
1086 ; AVX1-NEXT: movswl %dx, %eax
1087 ; AVX1-NEXT: vmovd %eax, %xmm7
1088 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
1089 ; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1090 ; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1091 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1092 ; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1093 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1094 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
1095 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
1096 ; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1097 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1098 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1099 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1100 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1101 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1102 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1103 ; AVX1-NEXT: retq
1107 ; AVX2-NEXT: vmovq %xmm0, %rdx
1108 ; AVX2-NEXT: movq %rdx, %r9
1109 ; AVX2-NEXT: movl %edx, %r10d
1110 ; AVX2-NEXT: movswl %dx, %r8d
1111 ; AVX2-NEXT: shrq $48, %rdx
1112 ; AVX2-NEXT: shrq $32, %r9
1113 ; AVX2-NEXT: shrl $16, %r10d
1114 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
1115 ; AVX2-NEXT: movq %rdi, %rsi
1116 ; AVX2-NEXT: movl %edi, %eax
1117 ; AVX2-NEXT: movswl %di, %ecx
1118 ; AVX2-NEXT: shrq $48, %rdi
1119 ; AVX2-NEXT: shrq $32, %rsi
1120 ; AVX2-NEXT: shrl $16, %eax
1121 ; AVX2-NEXT: cwtl
1122 ; AVX2-NEXT: vmovd %eax, %xmm0
1123 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
1124 ; AVX2-NEXT: vmovd %ecx, %xmm0
1125 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
1126 ; AVX2-NEXT: movswl %si, %eax
1127 ; AVX2-NEXT: vmovd %eax, %xmm0
1128 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
1129 ; AVX2-NEXT: movswl %di, %eax
1130 ; AVX2-NEXT: vmovd %eax, %xmm0
1131 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
1132 ; AVX2-NEXT: movswl %r10w, %eax
1133 ; AVX2-NEXT: vmovd %eax, %xmm0
1134 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
1135 ; AVX2-NEXT: vmovd %r8d, %xmm5
1136 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
1137 ; AVX2-NEXT: movswl %r9w, %eax
1138 ; AVX2-NEXT: vmovd %eax, %xmm6
1139 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
1140 ; AVX2-NEXT: movswl %dx, %eax
1141 ; AVX2-NEXT: vmovd %eax, %xmm7
1142 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
1143 ; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1144 ; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1145 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1146 ; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1147 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1148 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
1149 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
1150 ; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1151 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1152 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1153 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1154 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1155 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1156 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1157 ; AVX2-NEXT: retq
1161 ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
1162 ; AVX512-NEXT: movq %rdx, %r8
1163 ; AVX512-NEXT: movl %edx, %r10d
1164 ; AVX512-NEXT: movswl %dx, %r9d
1165 ; AVX512-NEXT: shrq $48, %rdx
1166 ; AVX512-NEXT: shrq $32, %r8
1167 ; AVX512-NEXT: shrl $16, %r10d
1168 ; AVX512-NEXT: vmovq %xmm0, %rdi
1169 ; AVX512-NEXT: movq %rdi, %rax
1170 ; AVX512-NEXT: movl %edi, %esi
1171 ; AVX512-NEXT: movswl %di, %ecx
1172 ; AVX512-NEXT: shrq $48, %rdi
1173 ; AVX512-NEXT: shrq $32, %rax
1174 ; AVX512-NEXT: shrl $16, %esi
1175 ; AVX512-NEXT: movswl %si, %esi
1176 ; AVX512-NEXT: vmovd %esi, %xmm0
1177 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
1178 ; AVX512-NEXT: vmovd %ecx, %xmm1
1179 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
1180 ; AVX512-NEXT: cwtl
1181 ; AVX512-NEXT: vmovd %eax, %xmm2
1182 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1183 ; AVX512-NEXT: movswl %di, %eax
1184 ; AVX512-NEXT: vmovd %eax, %xmm3
1185 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1186 ; AVX512-NEXT: movswl %r10w, %eax
1187 ; AVX512-NEXT: vmovd %eax, %xmm4
1188 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
1189 ; AVX512-NEXT: vmovd %r9d, %xmm5
1190 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1191 ; AVX512-NEXT: movswl %r8w, %eax
1192 ; AVX512-NEXT: vmovd %eax, %xmm6
1193 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
1194 ; AVX512-NEXT: movswl %dx, %eax
1195 ; AVX512-NEXT: vmovd %eax, %xmm7
1196 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
1197 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1198 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1199 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1200 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1201 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1202 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
1203 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
1204 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1205 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1206 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1207 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1208 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1209 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1210 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1211 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1212 ; AVX512-NEXT: retq
1225 ; ALL-NEXT: movswl (%rdi), %eax
1226 ; ALL-NEXT: vmovd %eax, %xmm0
1227 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1228 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1229 ; ALL-NEXT: retq
1239 ; ALL-NEXT: movswl (%rdi), %eax
1240 ; ALL-NEXT: vmovd %eax, %xmm0
1241 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1242 ; ALL-NEXT: movswl 2(%rdi), %eax
1243 ; ALL-NEXT: vmovd %eax, %xmm1
1244 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1245 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1246 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1247 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1248 ; ALL-NEXT: retq
1258 ; ALL-NEXT: movswl (%rdi), %eax
1259 ; ALL-NEXT: vmovd %eax, %xmm0
1260 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1261 ; ALL-NEXT: movswl 2(%rdi), %eax
1262 ; ALL-NEXT: vmovd %eax, %xmm1
1263 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1264 ; ALL-NEXT: movswl 4(%rdi), %eax
1265 ; ALL-NEXT: vmovd %eax, %xmm2
1266 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
1267 ; ALL-NEXT: movswl 6(%rdi), %eax
1268 ; ALL-NEXT: vmovd %eax, %xmm3
1269 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
1270 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1271 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1272 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1273 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1274 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1275 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1276 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1277 ; ALL-NEXT: retq
1287 ; ALL-NEXT: movq (%rdi), %rax
1288 ; ALL-NEXT: movq %rax, %rcx
1289 ; ALL-NEXT: movl %eax, %edx
1290 ; ALL-NEXT: movswl %ax, %esi
1291 ; ALL-NEXT: shrq $48, %rax
1292 ; ALL-NEXT: shrq $32, %rcx
1293 ; ALL-NEXT: shrl $16, %edx
1294 ; ALL-NEXT: movswl %dx, %edx
1295 ; ALL-NEXT: vmovd %edx, %xmm0
1296 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1297 ; ALL-NEXT: vmovd %esi, %xmm1
1298 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1299 ; ALL-NEXT: movswl %cx, %ecx
1300 ; ALL-NEXT: vmovd %ecx, %xmm2
1301 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
1302 ; ALL-NEXT: cwtl
1303 ; ALL-NEXT: vmovd %eax, %xmm3
1304 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
1305 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1306 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1307 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1308 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1309 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1310 ; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1311 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1312 ; ALL-NEXT: retq
1323 ; AVX1-NEXT: movswl 8(%rdi), %eax
1324 ; AVX1-NEXT: vmovd %eax, %xmm0
1325 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
1326 ; AVX1-NEXT: movswl 10(%rdi), %eax
1327 ; AVX1-NEXT: vmovd %eax, %xmm0
1328 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
1329 ; AVX1-NEXT: movswl 12(%rdi), %eax
1330 ; AVX1-NEXT: vmovd %eax, %xmm0
1331 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
1332 ; AVX1-NEXT: movswl 14(%rdi), %eax
1333 ; AVX1-NEXT: vmovd %eax, %xmm0
1334 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
1335 ; AVX1-NEXT: movswl (%rdi), %eax
1336 ; AVX1-NEXT: vmovd %eax, %xmm0
1337 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
1338 ; AVX1-NEXT: movswl 2(%rdi), %eax
1339 ; AVX1-NEXT: vmovd %eax, %xmm5
1340 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
1341 ; AVX1-NEXT: movswl 4(%rdi), %eax
1342 ; AVX1-NEXT: vmovd %eax, %xmm6
1343 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
1344 ; AVX1-NEXT: movswl 6(%rdi), %eax
1345 ; AVX1-NEXT: vmovd %eax, %xmm7
1346 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
1347 ; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1348 ; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1349 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1350 ; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1351 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1352 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1353 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
1354 ; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1355 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1356 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1357 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1358 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1359 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1360 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1361 ; AVX1-NEXT: retq
1365 ; AVX2-NEXT: movswl 8(%rdi), %eax
1366 ; AVX2-NEXT: vmovd %eax, %xmm0
1367 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
1368 ; AVX2-NEXT: movswl 10(%rdi), %eax
1369 ; AVX2-NEXT: vmovd %eax, %xmm0
1370 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
1371 ; AVX2-NEXT: movswl 12(%rdi), %eax
1372 ; AVX2-NEXT: vmovd %eax, %xmm0
1373 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
1374 ; AVX2-NEXT: movswl 14(%rdi), %eax
1375 ; AVX2-NEXT: vmovd %eax, %xmm0
1376 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
1377 ; AVX2-NEXT: movswl (%rdi), %eax
1378 ; AVX2-NEXT: vmovd %eax, %xmm0
1379 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
1380 ; AVX2-NEXT: movswl 2(%rdi), %eax
1381 ; AVX2-NEXT: vmovd %eax, %xmm5
1382 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
1383 ; AVX2-NEXT: movswl 4(%rdi), %eax
1384 ; AVX2-NEXT: vmovd %eax, %xmm6
1385 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
1386 ; AVX2-NEXT: movswl 6(%rdi), %eax
1387 ; AVX2-NEXT: vmovd %eax, %xmm7
1388 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
1389 ; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1390 ; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1391 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1392 ; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1393 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1394 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1395 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
1396 ; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1397 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1398 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1399 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1400 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1401 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1402 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1403 ; AVX2-NEXT: retq
1407 ; AVX512-NEXT: movswl (%rdi), %eax
1408 ; AVX512-NEXT: vmovd %eax, %xmm0
1409 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
1410 ; AVX512-NEXT: movswl 2(%rdi), %eax
1411 ; AVX512-NEXT: vmovd %eax, %xmm1
1412 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
1413 ; AVX512-NEXT: movswl 4(%rdi), %eax
1414 ; AVX512-NEXT: vmovd %eax, %xmm2
1415 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1416 ; AVX512-NEXT: movswl 6(%rdi), %eax
1417 ; AVX512-NEXT: vmovd %eax, %xmm3
1418 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1419 ; AVX512-NEXT: movswl 8(%rdi), %eax
1420 ; AVX512-NEXT: vmovd %eax, %xmm4
1421 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
1422 ; AVX512-NEXT: movswl 10(%rdi), %eax
1423 ; AVX512-NEXT: vmovd %eax, %xmm5
1424 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1425 ; AVX512-NEXT: movswl 12(%rdi), %eax
1426 ; AVX512-NEXT: vmovd %eax, %xmm6
1427 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
1428 ; AVX512-NEXT: movswl 14(%rdi), %eax
1429 ; AVX512-NEXT: vmovd %eax, %xmm7
1430 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
1431 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1432 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1433 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1434 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1435 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1436 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
1437 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
1438 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1439 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1440 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1441 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1442 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1443 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1444 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1445 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1446 ; AVX512-NEXT: retq
1460 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1461 ; ALL-NEXT: vmovd %xmm0, %eax
1462 ; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
1463 ; ALL-NEXT: retq
1472 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1473 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1474 ; ALL-NEXT: vmovd %xmm1, %eax
1475 ; ALL-NEXT: shll $16, %eax
1476 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1477 ; ALL-NEXT: vmovd %xmm1, %ecx
1478 ; ALL-NEXT: movzwl %cx, %ecx
1479 ; ALL-NEXT: orl %eax, %ecx
1480 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1481 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1482 ; ALL-NEXT: vmovd %xmm1, %eax
1483 ; ALL-NEXT: shll $16, %eax
1484 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1485 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1486 ; ALL-NEXT: vmovd %xmm0, %edx
1487 ; ALL-NEXT: movzwl %dx, %edx
1488 ; ALL-NEXT: orl %eax, %edx
1489 ; ALL-NEXT: shlq $32, %rdx
1490 ; ALL-NEXT: orq %rcx, %rdx
1491 ; ALL-NEXT: vmovq %rdx, %xmm0
1492 ; ALL-NEXT: retq
1501 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1502 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1503 ; ALL-NEXT: vmovd %xmm1, %eax
1504 ; ALL-NEXT: shll $16, %eax
1505 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1506 ; ALL-NEXT: vmovd %xmm1, %ecx
1507 ; ALL-NEXT: movzwl %cx, %ecx
1508 ; ALL-NEXT: orl %eax, %ecx
1509 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1510 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1511 ; ALL-NEXT: vmovd %xmm1, %eax
1512 ; ALL-NEXT: shll $16, %eax
1513 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1514 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1515 ; ALL-NEXT: vmovd %xmm0, %edx
1516 ; ALL-NEXT: movzwl %dx, %edx
1517 ; ALL-NEXT: orl %eax, %edx
1518 ; ALL-NEXT: shlq $32, %rdx
1519 ; ALL-NEXT: orq %rcx, %rdx
1520 ; ALL-NEXT: vmovq %rdx, %xmm0
1521 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1522 ; ALL-NEXT: retq
1532 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1533 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1534 ; ALL-NEXT: vmovd %xmm1, %eax
1535 ; ALL-NEXT: shll $16, %eax
1536 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1537 ; ALL-NEXT: vmovd %xmm1, %ecx
1538 ; ALL-NEXT: movzwl %cx, %ecx
1539 ; ALL-NEXT: orl %eax, %ecx
1540 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1541 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1542 ; ALL-NEXT: vmovd %xmm1, %eax
1543 ; ALL-NEXT: shll $16, %eax
1544 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1545 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1546 ; ALL-NEXT: vmovd %xmm0, %edx
1547 ; ALL-NEXT: movzwl %dx, %edx
1548 ; ALL-NEXT: orl %eax, %edx
1549 ; ALL-NEXT: shlq $32, %rdx
1550 ; ALL-NEXT: orq %rcx, %rdx
1551 ; ALL-NEXT: vmovq %rdx, %xmm0
1552 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
1553 ; ALL-NEXT: retq
1563 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1564 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1565 ; AVX1-NEXT: vmovd %xmm1, %eax
1566 ; AVX1-NEXT: shll $16, %eax
1567 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1568 ; AVX1-NEXT: vmovd %xmm1, %ecx
1569 ; AVX1-NEXT: movzwl %cx, %ecx
1570 ; AVX1-NEXT: orl %eax, %ecx
1571 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1572 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1573 ; AVX1-NEXT: vmovd %xmm1, %edx
1574 ; AVX1-NEXT: shll $16, %edx
1575 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1576 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1577 ; AVX1-NEXT: vmovd %xmm1, %eax
1578 ; AVX1-NEXT: movzwl %ax, %eax
1579 ; AVX1-NEXT: orl %edx, %eax
1580 ; AVX1-NEXT: shlq $32, %rax
1581 ; AVX1-NEXT: orq %rcx, %rax
1582 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1583 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1584 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1585 ; AVX1-NEXT: vmovd %xmm1, %ecx
1586 ; AVX1-NEXT: shll $16, %ecx
1587 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1588 ; AVX1-NEXT: vmovd %xmm1, %edx
1589 ; AVX1-NEXT: movzwl %dx, %edx
1590 ; AVX1-NEXT: orl %ecx, %edx
1591 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1592 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1593 ; AVX1-NEXT: vmovd %xmm1, %ecx
1594 ; AVX1-NEXT: shll $16, %ecx
1595 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1596 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1597 ; AVX1-NEXT: vmovd %xmm0, %esi
1598 ; AVX1-NEXT: movzwl %si, %esi
1599 ; AVX1-NEXT: orl %ecx, %esi
1600 ; AVX1-NEXT: shlq $32, %rsi
1601 ; AVX1-NEXT: orq %rdx, %rsi
1602 ; AVX1-NEXT: vmovq %rsi, %xmm0
1603 ; AVX1-NEXT: vmovq %rax, %xmm1
1604 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1605 ; AVX1-NEXT: vzeroupper
1606 ; AVX1-NEXT: retq
1610 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1611 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1612 ; AVX2-NEXT: vmovd %xmm1, %eax
1613 ; AVX2-NEXT: shll $16, %eax
1614 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1615 ; AVX2-NEXT: vmovd %xmm1, %ecx
1616 ; AVX2-NEXT: movzwl %cx, %ecx
1617 ; AVX2-NEXT: orl %eax, %ecx
1618 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1619 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1620 ; AVX2-NEXT: vmovd %xmm1, %edx
1621 ; AVX2-NEXT: shll $16, %edx
1622 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1623 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1624 ; AVX2-NEXT: vmovd %xmm1, %eax
1625 ; AVX2-NEXT: movzwl %ax, %eax
1626 ; AVX2-NEXT: orl %edx, %eax
1627 ; AVX2-NEXT: shlq $32, %rax
1628 ; AVX2-NEXT: orq %rcx, %rax
1629 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
1630 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1631 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1632 ; AVX2-NEXT: vmovd %xmm1, %ecx
1633 ; AVX2-NEXT: shll $16, %ecx
1634 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1635 ; AVX2-NEXT: vmovd %xmm1, %edx
1636 ; AVX2-NEXT: movzwl %dx, %edx
1637 ; AVX2-NEXT: orl %ecx, %edx
1638 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1639 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1640 ; AVX2-NEXT: vmovd %xmm1, %ecx
1641 ; AVX2-NEXT: shll $16, %ecx
1642 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1643 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1644 ; AVX2-NEXT: vmovd %xmm0, %esi
1645 ; AVX2-NEXT: movzwl %si, %esi
1646 ; AVX2-NEXT: orl %ecx, %esi
1647 ; AVX2-NEXT: shlq $32, %rsi
1648 ; AVX2-NEXT: orq %rdx, %rsi
1649 ; AVX2-NEXT: vmovq %rsi, %xmm0
1650 ; AVX2-NEXT: vmovq %rax, %xmm1
1651 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1652 ; AVX2-NEXT: vzeroupper
1653 ; AVX2-NEXT: retq
1657 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1658 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1659 ; AVX512-NEXT: vmovd %xmm1, %eax
1660 ; AVX512-NEXT: shll $16, %eax
1661 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1662 ; AVX512-NEXT: vmovd %xmm1, %ecx
1663 ; AVX512-NEXT: movzwl %cx, %ecx
1664 ; AVX512-NEXT: orl %eax, %ecx
1665 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1666 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1667 ; AVX512-NEXT: vmovd %xmm1, %edx
1668 ; AVX512-NEXT: shll $16, %edx
1669 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1670 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1671 ; AVX512-NEXT: vmovd %xmm1, %eax
1672 ; AVX512-NEXT: movzwl %ax, %eax
1673 ; AVX512-NEXT: orl %edx, %eax
1674 ; AVX512-NEXT: shlq $32, %rax
1675 ; AVX512-NEXT: orq %rcx, %rax
1676 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
1677 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1678 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1679 ; AVX512-NEXT: vmovd %xmm1, %ecx
1680 ; AVX512-NEXT: shll $16, %ecx
1681 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1682 ; AVX512-NEXT: vmovd %xmm1, %edx
1683 ; AVX512-NEXT: movzwl %dx, %edx
1684 ; AVX512-NEXT: orl %ecx, %edx
1685 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1686 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1687 ; AVX512-NEXT: vmovd %xmm1, %ecx
1688 ; AVX512-NEXT: shll $16, %ecx
1689 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1690 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1691 ; AVX512-NEXT: vmovd %xmm0, %esi
1692 ; AVX512-NEXT: movzwl %si, %esi
1693 ; AVX512-NEXT: orl %ecx, %esi
1694 ; AVX512-NEXT: shlq $32, %rsi
1695 ; AVX512-NEXT: orq %rdx, %rsi
1696 ; AVX512-NEXT: vmovq %rsi, %xmm0
1697 ; AVX512-NEXT: vmovq %rax, %xmm1
1698 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1699 ; AVX512-NEXT: retq
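; NOTE: In the wider vector cases that follow, each f32 lane is still converted
; individually: the lane is shuffled into element 0, vcvtps2ph produces the
; 16-bit half in a vector register, vmovd moves it to a GPR, and vpinsrw
; rebuilds the i16 vector one element at a time.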
1708 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1709 ; AVX1-NEXT: vmovd %xmm2, %eax
1710 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1711 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1712 ; AVX1-NEXT: vmovd %eax, %xmm3
1713 ; AVX1-NEXT: vmovd %xmm2, %eax
1714 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1715 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1716 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1717 ; AVX1-NEXT: vmovd %xmm2, %eax
1718 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1719 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1720 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1721 ; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1722 ; AVX1-NEXT: vmovd %xmm1, %eax
1723 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1724 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1725 ; AVX1-NEXT: vmovd %xmm1, %eax
1726 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1727 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1728 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1729 ; AVX1-NEXT: vmovd %xmm1, %eax
1730 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1731 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1732 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1733 ; AVX1-NEXT: vmovd %xmm1, %eax
1734 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1735 ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1736 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1737 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1738 ; AVX1-NEXT: vmovd %xmm2, %eax
1739 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1740 ; AVX1-NEXT: vmovd %xmm1, %eax
1741 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1742 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1743 ; AVX1-NEXT: vmovd %eax, %xmm3
1744 ; AVX1-NEXT: vmovd %xmm1, %eax
1745 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1746 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1747 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1748 ; AVX1-NEXT: vmovd %xmm1, %eax
1749 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1750 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1751 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1752 ; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1753 ; AVX1-NEXT: vmovd %xmm0, %eax
1754 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1755 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1756 ; AVX1-NEXT: vmovd %xmm0, %eax
1757 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1758 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1759 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1760 ; AVX1-NEXT: vmovd %xmm0, %eax
1761 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1762 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1763 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1764 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1765 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1766 ; AVX1-NEXT: vmovd %xmm1, %eax
1767 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1768 ; AVX1-NEXT: vmovd %xmm0, %eax
1769 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1770 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1771 ; AVX1-NEXT: retq
1775 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1776 ; AVX2-NEXT: vmovd %xmm2, %eax
1777 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1778 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1779 ; AVX2-NEXT: vmovd %eax, %xmm3
1780 ; AVX2-NEXT: vmovd %xmm2, %eax
1781 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1782 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1783 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1784 ; AVX2-NEXT: vmovd %xmm2, %eax
1785 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
1786 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1787 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1788 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1789 ; AVX2-NEXT: vmovd %xmm1, %eax
1790 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1791 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1792 ; AVX2-NEXT: vmovd %xmm1, %eax
1793 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1794 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1795 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1796 ; AVX2-NEXT: vmovd %xmm1, %eax
1797 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1798 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1799 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1800 ; AVX2-NEXT: vmovd %xmm1, %eax
1801 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1802 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1803 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1804 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1805 ; AVX2-NEXT: vmovd %xmm2, %eax
1806 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1807 ; AVX2-NEXT: vmovd %xmm1, %eax
1808 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1809 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1810 ; AVX2-NEXT: vmovd %eax, %xmm3
1811 ; AVX2-NEXT: vmovd %xmm1, %eax
1812 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1813 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1814 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1815 ; AVX2-NEXT: vmovd %xmm1, %eax
1816 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
1817 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1818 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1819 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1820 ; AVX2-NEXT: vmovd %xmm0, %eax
1821 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1822 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1823 ; AVX2-NEXT: vmovd %xmm0, %eax
1824 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1825 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1826 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1827 ; AVX2-NEXT: vmovd %xmm0, %eax
1828 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1829 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1830 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1831 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1832 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1833 ; AVX2-NEXT: vmovd %xmm1, %eax
1834 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1835 ; AVX2-NEXT: vmovd %xmm0, %eax
1836 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1837 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1838 ; AVX2-NEXT: retq
1842 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
1843 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1844 ; AVX512-NEXT: vmovd %xmm2, %eax
1845 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1846 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1847 ; AVX512-NEXT: vmovd %eax, %xmm3
1848 ; AVX512-NEXT: vmovd %xmm2, %eax
1849 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1850 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1851 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1852 ; AVX512-NEXT: vmovd %xmm2, %eax
1853 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
1854 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1855 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1856 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1857 ; AVX512-NEXT: vmovd %xmm1, %eax
1858 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1859 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1860 ; AVX512-NEXT: vmovd %xmm1, %eax
1861 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1862 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1863 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1864 ; AVX512-NEXT: vmovd %xmm1, %eax
1865 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1866 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1867 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1868 ; AVX512-NEXT: vmovd %xmm1, %eax
1869 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1870 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1871 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1872 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1873 ; AVX512-NEXT: vmovd %xmm2, %eax
1874 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1875 ; AVX512-NEXT: vmovd %xmm1, %eax
1876 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1877 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1878 ; AVX512-NEXT: vmovd %eax, %xmm3
1879 ; AVX512-NEXT: vmovd %xmm1, %eax
1880 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1881 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1882 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1883 ; AVX512-NEXT: vmovd %xmm1, %eax
1884 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
1885 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1886 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1887 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1888 ; AVX512-NEXT: vmovd %xmm0, %eax
1889 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1890 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1891 ; AVX512-NEXT: vmovd %xmm0, %eax
1892 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1893 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1894 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1895 ; AVX512-NEXT: vmovd %xmm0, %eax
1896 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
1897 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1898 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1899 ; AVX512-NEXT: vmovd %xmm0, %eax
1900 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1901 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1902 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1903 ; AVX512-NEXT: vmovd %xmm0, %eax
1904 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1905 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1906 ; AVX512-NEXT: retq
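; NOTE: The store variants below follow the same per-lane vcvtps2ph pattern but
; write each 16-bit result directly to memory with movw at the appropriate
; offset from the destination pointer, so no vector of halves is rebuilt.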
1919 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1920 ; ALL-NEXT: vmovd %xmm0, %eax
1921 ; ALL-NEXT: movw %ax, (%rdi)
1922 ; ALL-NEXT: retq
1932 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1933 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1934 ; ALL-NEXT: vmovd %xmm1, %eax
1935 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1936 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1937 ; ALL-NEXT: vmovd %xmm1, %ecx
1938 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1939 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1940 ; ALL-NEXT: vmovd %xmm1, %edx
1941 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1942 ; ALL-NEXT: vmovd %xmm0, %esi
1943 ; ALL-NEXT: movw %si, (%rdi)
1944 ; ALL-NEXT: movw %dx, 6(%rdi)
1945 ; ALL-NEXT: movw %cx, 4(%rdi)
1946 ; ALL-NEXT: movw %ax, 2(%rdi)
1947 ; ALL-NEXT: retq
1957 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1958 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1959 ; ALL-NEXT: vmovd %xmm1, %eax
1960 ; ALL-NEXT: shll $16, %eax
1961 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1962 ; ALL-NEXT: vmovd %xmm1, %ecx
1963 ; ALL-NEXT: movzwl %cx, %ecx
1964 ; ALL-NEXT: orl %eax, %ecx
1965 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1966 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1967 ; ALL-NEXT: vmovd %xmm1, %eax
1968 ; ALL-NEXT: shll $16, %eax
1969 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1970 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1971 ; ALL-NEXT: vmovd %xmm0, %edx
1972 ; ALL-NEXT: movzwl %dx, %edx
1973 ; ALL-NEXT: orl %eax, %edx
1974 ; ALL-NEXT: shlq $32, %rdx
1975 ; ALL-NEXT: orq %rcx, %rdx
1976 ; ALL-NEXT: vmovq %rdx, %xmm0
1977 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1978 ; ALL-NEXT: vmovdqa %xmm0, (%rdi)
1979 ; ALL-NEXT: retq
1990 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1991 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1992 ; ALL-NEXT: vmovd %xmm1, %eax
1993 ; ALL-NEXT: shll $16, %eax
1994 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1995 ; ALL-NEXT: vmovd %xmm1, %ecx
1996 ; ALL-NEXT: movzwl %cx, %ecx
1997 ; ALL-NEXT: orl %eax, %ecx
1998 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1999 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2000 ; ALL-NEXT: vmovd %xmm1, %eax
2001 ; ALL-NEXT: shll $16, %eax
2002 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2003 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2004 ; ALL-NEXT: vmovd %xmm0, %edx
2005 ; ALL-NEXT: movzwl %dx, %edx
2006 ; ALL-NEXT: orl %eax, %edx
2007 ; ALL-NEXT: shlq $32, %rdx
2008 ; ALL-NEXT: orq %rcx, %rdx
2009 ; ALL-NEXT: vmovq %rdx, %xmm0
2010 ; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2011 ; ALL-NEXT: vmovdqa %xmm0, (%rdi)
2012 ; ALL-NEXT: retq
2023 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2024 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2025 ; AVX1-NEXT: vmovd %xmm1, %r8d
2026 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2027 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2028 ; AVX1-NEXT: vmovd %xmm1, %r9d
2029 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2030 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2031 ; AVX1-NEXT: vmovd %xmm1, %r10d
2032 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2033 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2034 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2035 ; AVX1-NEXT: vmovd %xmm2, %r11d
2036 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2037 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2038 ; AVX1-NEXT: vmovd %xmm2, %eax
2039 ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2040 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2041 ; AVX1-NEXT: vmovd %xmm2, %ecx
2042 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2043 ; AVX1-NEXT: vmovd %xmm0, %edx
2044 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
2045 ; AVX1-NEXT: vmovd %xmm0, %esi
2046 ; AVX1-NEXT: movw %si, 8(%rdi)
2047 ; AVX1-NEXT: movw %dx, (%rdi)
2048 ; AVX1-NEXT: movw %cx, 14(%rdi)
2049 ; AVX1-NEXT: movw %ax, 12(%rdi)
2050 ; AVX1-NEXT: movw %r11w, 10(%rdi)
2051 ; AVX1-NEXT: movw %r10w, 6(%rdi)
2052 ; AVX1-NEXT: movw %r9w, 4(%rdi)
2053 ; AVX1-NEXT: movw %r8w, 2(%rdi)
2054 ; AVX1-NEXT: vzeroupper
2055 ; AVX1-NEXT: retq
2059 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2060 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2061 ; AVX2-NEXT: vmovd %xmm1, %r8d
2062 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2063 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2064 ; AVX2-NEXT: vmovd %xmm1, %r9d
2065 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2066 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2067 ; AVX2-NEXT: vmovd %xmm1, %r10d
2068 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
2069 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2070 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2071 ; AVX2-NEXT: vmovd %xmm2, %r11d
2072 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2073 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2074 ; AVX2-NEXT: vmovd %xmm2, %eax
2075 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2076 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2077 ; AVX2-NEXT: vmovd %xmm2, %ecx
2078 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2079 ; AVX2-NEXT: vmovd %xmm0, %edx
2080 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
2081 ; AVX2-NEXT: vmovd %xmm0, %esi
2082 ; AVX2-NEXT: movw %si, 8(%rdi)
2083 ; AVX2-NEXT: movw %dx, (%rdi)
2084 ; AVX2-NEXT: movw %cx, 14(%rdi)
2085 ; AVX2-NEXT: movw %ax, 12(%rdi)
2086 ; AVX2-NEXT: movw %r11w, 10(%rdi)
2087 ; AVX2-NEXT: movw %r10w, 6(%rdi)
2088 ; AVX2-NEXT: movw %r9w, 4(%rdi)
2089 ; AVX2-NEXT: movw %r8w, 2(%rdi)
2090 ; AVX2-NEXT: vzeroupper
2091 ; AVX2-NEXT: retq
2095 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2096 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2097 ; AVX512-NEXT: vmovd %xmm1, %r8d
2098 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2099 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2100 ; AVX512-NEXT: vmovd %xmm1, %r9d
2101 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2102 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2103 ; AVX512-NEXT: vmovd %xmm1, %r10d
2104 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
2105 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2106 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2107 ; AVX512-NEXT: vmovd %xmm2, %r11d
2108 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2109 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2110 ; AVX512-NEXT: vmovd %xmm2, %eax
2111 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2112 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2113 ; AVX512-NEXT: vmovd %xmm2, %ecx
2114 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2115 ; AVX512-NEXT: vmovd %xmm0, %edx
2116 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
2117 ; AVX512-NEXT: vmovd %xmm0, %esi
2118 ; AVX512-NEXT: movw %si, 8(%rdi)
2119 ; AVX512-NEXT: movw %dx, (%rdi)
2120 ; AVX512-NEXT: movw %cx, 14(%rdi)
2121 ; AVX512-NEXT: movw %ax, 12(%rdi)
2122 ; AVX512-NEXT: movw %r11w, 10(%rdi)
2123 ; AVX512-NEXT: movw %r10w, 6(%rdi)
2124 ; AVX512-NEXT: movw %r9w, 4(%rdi)
2125 ; AVX512-NEXT: movw %r8w, 2(%rdi)
2126 ; AVX512-NEXT: retq
2136 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2137 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2138 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm4
2139 ; AVX1-NEXT: vmovd %xmm4, %eax
2140 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm4
2141 ; AVX1-NEXT: movw %ax, 24(%rdi)
2142 ; AVX1-NEXT: vmovd %xmm4, %eax
2143 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm4
2144 ; AVX1-NEXT: movw %ax, 16(%rdi)
2145 ; AVX1-NEXT: vmovd %xmm4, %eax
2146 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm4
2147 ; AVX1-NEXT: movw %ax, 8(%rdi)
2148 ; AVX1-NEXT: vmovd %xmm4, %eax
2149 ; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2150 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2151 ; AVX1-NEXT: movw %ax, (%rdi)
2152 ; AVX1-NEXT: vmovd %xmm4, %eax
2153 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2154 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2155 ; AVX1-NEXT: movw %ax, 30(%rdi)
2156 ; AVX1-NEXT: vmovd %xmm4, %eax
2157 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2158 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2159 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2160 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2161 ; AVX1-NEXT: movw %ax, 28(%rdi)
2162 ; AVX1-NEXT: vmovd %xmm3, %eax
2163 ; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
2164 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2165 ; AVX1-NEXT: movw %ax, 26(%rdi)
2166 ; AVX1-NEXT: vmovd %xmm3, %eax
2167 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
2168 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2169 ; AVX1-NEXT: movw %ax, 22(%rdi)
2170 ; AVX1-NEXT: vmovd %xmm3, %eax
2171 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2172 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2173 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2174 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2175 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
2176 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2177 ; AVX1-NEXT: movw %ax, 20(%rdi)
2178 ; AVX1-NEXT: vmovd %xmm1, %eax
2179 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
2180 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2181 ; AVX1-NEXT: movw %ax, 18(%rdi)
2182 ; AVX1-NEXT: vmovd %xmm1, %eax
2183 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2184 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2185 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
2186 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2187 ; AVX1-NEXT: movw %ax, 14(%rdi)
2188 ; AVX1-NEXT: vmovd %xmm2, %eax
2189 ; AVX1-NEXT: movw %ax, 12(%rdi)
2190 ; AVX1-NEXT: vmovd %xmm1, %eax
2191 ; AVX1-NEXT: movw %ax, 10(%rdi)
2192 ; AVX1-NEXT: vmovd %xmm0, %eax
2193 ; AVX1-NEXT: movw %ax, 6(%rdi)
2194 ; AVX1-NEXT: vmovd %xmm3, %eax
2195 ; AVX1-NEXT: movw %ax, 4(%rdi)
2196 ; AVX1-NEXT: vmovd %xmm4, %eax
2197 ; AVX1-NEXT: movw %ax, 2(%rdi)
2198 ; AVX1-NEXT: vzeroupper
2199 ; AVX1-NEXT: retq
2203 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
2204 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
2205 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm4
2206 ; AVX2-NEXT: vmovd %xmm4, %eax
2207 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm4
2208 ; AVX2-NEXT: movw %ax, 24(%rdi)
2209 ; AVX2-NEXT: vmovd %xmm4, %eax
2210 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm4
2211 ; AVX2-NEXT: movw %ax, 16(%rdi)
2212 ; AVX2-NEXT: vmovd %xmm4, %eax
2213 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm4
2214 ; AVX2-NEXT: movw %ax, 8(%rdi)
2215 ; AVX2-NEXT: vmovd %xmm4, %eax
2216 ; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2217 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2218 ; AVX2-NEXT: movw %ax, (%rdi)
2219 ; AVX2-NEXT: vmovd %xmm4, %eax
2220 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2221 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2222 ; AVX2-NEXT: movw %ax, 30(%rdi)
2223 ; AVX2-NEXT: vmovd %xmm4, %eax
2224 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2225 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2226 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2227 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2228 ; AVX2-NEXT: movw %ax, 28(%rdi)
2229 ; AVX2-NEXT: vmovd %xmm3, %eax
2230 ; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
2231 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2232 ; AVX2-NEXT: movw %ax, 26(%rdi)
2233 ; AVX2-NEXT: vmovd %xmm3, %eax
2234 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
2235 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2236 ; AVX2-NEXT: movw %ax, 22(%rdi)
2237 ; AVX2-NEXT: vmovd %xmm3, %eax
2238 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2239 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2240 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2241 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2242 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
2243 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2244 ; AVX2-NEXT: movw %ax, 20(%rdi)
2245 ; AVX2-NEXT: vmovd %xmm1, %eax
2246 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
2247 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2248 ; AVX2-NEXT: movw %ax, 18(%rdi)
2249 ; AVX2-NEXT: vmovd %xmm1, %eax
2250 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2251 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2252 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
2253 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2254 ; AVX2-NEXT: movw %ax, 14(%rdi)
2255 ; AVX2-NEXT: vmovd %xmm2, %eax
2256 ; AVX2-NEXT: movw %ax, 12(%rdi)
2257 ; AVX2-NEXT: vmovd %xmm1, %eax
2258 ; AVX2-NEXT: movw %ax, 10(%rdi)
2259 ; AVX2-NEXT: vmovd %xmm0, %eax
2260 ; AVX2-NEXT: movw %ax, 6(%rdi)
2261 ; AVX2-NEXT: vmovd %xmm3, %eax
2262 ; AVX2-NEXT: movw %ax, 4(%rdi)
2263 ; AVX2-NEXT: vmovd %xmm4, %eax
2264 ; AVX2-NEXT: movw %ax, 2(%rdi)
2265 ; AVX2-NEXT: vzeroupper
2266 ; AVX2-NEXT: retq
2270 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
2271 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
2272 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
2273 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
2274 ; AVX512-NEXT: vmovd %xmm4, %eax
2275 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
2276 ; AVX512-NEXT: movw %ax, 24(%rdi)
2277 ; AVX512-NEXT: vmovd %xmm4, %eax
2278 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
2279 ; AVX512-NEXT: movw %ax, 16(%rdi)
2280 ; AVX512-NEXT: vmovd %xmm4, %eax
2281 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
2282 ; AVX512-NEXT: movw %ax, 8(%rdi)
2283 ; AVX512-NEXT: vmovd %xmm4, %eax
2284 ; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2285 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2286 ; AVX512-NEXT: movw %ax, (%rdi)
2287 ; AVX512-NEXT: vmovd %xmm4, %eax
2288 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2289 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2290 ; AVX512-NEXT: movw %ax, 30(%rdi)
2291 ; AVX512-NEXT: vmovd %xmm4, %eax
2292 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2293 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2294 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2295 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2296 ; AVX512-NEXT: movw %ax, 28(%rdi)
2297 ; AVX512-NEXT: vmovd %xmm3, %eax
2298 ; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
2299 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2300 ; AVX512-NEXT: movw %ax, 26(%rdi)
2301 ; AVX512-NEXT: vmovd %xmm3, %eax
2302 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
2303 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2304 ; AVX512-NEXT: movw %ax, 22(%rdi)
2305 ; AVX512-NEXT: vmovd %xmm3, %eax
2306 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2307 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2308 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2309 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2310 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
2311 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2312 ; AVX512-NEXT: movw %ax, 20(%rdi)
2313 ; AVX512-NEXT: vmovd %xmm2, %eax
2314 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2315 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2316 ; AVX512-NEXT: movw %ax, 18(%rdi)
2317 ; AVX512-NEXT: vmovd %xmm2, %eax
2318 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2319 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2320 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
2321 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2322 ; AVX512-NEXT: movw %ax, 14(%rdi)
2323 ; AVX512-NEXT: vmovd %xmm1, %eax
2324 ; AVX512-NEXT: movw %ax, 12(%rdi)
2325 ; AVX512-NEXT: vmovd %xmm2, %eax
2326 ; AVX512-NEXT: movw %ax, 10(%rdi)
2327 ; AVX512-NEXT: vmovd %xmm0, %eax
2328 ; AVX512-NEXT: movw %ax, 6(%rdi)
2329 ; AVX512-NEXT: vmovd %xmm3, %eax
2330 ; AVX512-NEXT: movw %ax, 4(%rdi)
2331 ; AVX512-NEXT: vmovd %xmm4, %eax
2332 ; AVX512-NEXT: movw %ax, 2(%rdi)
2333 ; AVX512-NEXT: retq
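; NOTE: These targets have no f64-to-half conversion instruction, so the
; f64 cases below are lowered to libcalls: each scalar double is fed to the
; compiler-rt routine __truncdfhf2, with the source vector spilled to the
; stack and reloaded around every call.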
2347 ; ALL-NEXT: jmp __truncdfhf2 # TAILCALL
2356 ; ALL-NEXT: pushq %rbx
2357 ; ALL-NEXT: .Ltmp0:
2358 ; ALL-NEXT: .cfi_def_cfa_offset 16
2359 ; ALL-NEXT: subq $16, %rsp
2360 ; ALL-NEXT: .Ltmp1:
2361 ; ALL-NEXT: .cfi_def_cfa_offset 32
2362 ; ALL-NEXT: .Ltmp2:
2363 ; ALL-NEXT: .cfi_offset %rbx, -16
2364 ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2365 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2366 ; ALL-NEXT: callq __truncdfhf2
2367 ; ALL-NEXT: movw %ax, %bx
2368 ; ALL-NEXT: shll $16, %ebx
2369 ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2370 ; ALL-NEXT: callq __truncdfhf2
2371 ; ALL-NEXT: movzwl %ax, %eax
2372 ; ALL-NEXT: orl %ebx, %eax
2373 ; ALL-NEXT: vmovd %eax, %xmm0
2374 ; ALL-NEXT: addq $16, %rsp
2375 ; ALL-NEXT: popq %rbx
2376 ; ALL-NEXT: retq
2385 ; AVX1-NEXT: pushq %r14
2386 ; AVX1-NEXT: .Ltmp3:
2387 ; AVX1-NEXT: .cfi_def_cfa_offset 16
2388 ; AVX1-NEXT: pushq %rbx
2389 ; AVX1-NEXT: .Ltmp4:
2390 ; AVX1-NEXT: .cfi_def_cfa_offset 24
2391 ; AVX1-NEXT: subq $40, %rsp
2392 ; AVX1-NEXT: .Ltmp5:
2393 ; AVX1-NEXT: .cfi_def_cfa_offset 64
2394 ; AVX1-NEXT: .Ltmp6:
2395 ; AVX1-NEXT: .cfi_offset %rbx, -24
2396 ; AVX1-NEXT: .Ltmp7:
2397 ; AVX1-NEXT: .cfi_offset %r14, -16
2398 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2399 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2400 ; AVX1-NEXT: vzeroupper
2401 ; AVX1-NEXT: callq __truncdfhf2
2402 ; AVX1-NEXT: movw %ax, %bx
2403 ; AVX1-NEXT: shll $16, %ebx
2404 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2405 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2406 ; AVX1-NEXT: vzeroupper
2407 ; AVX1-NEXT: callq __truncdfhf2
2408 ; AVX1-NEXT: movzwl %ax, %r14d
2409 ; AVX1-NEXT: orl %ebx, %r14d
2410 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2411 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2412 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2413 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2414 ; AVX1-NEXT: vzeroupper
2415 ; AVX1-NEXT: callq __truncdfhf2
2416 ; AVX1-NEXT: movw %ax, %bx
2417 ; AVX1-NEXT: shll $16, %ebx
2418 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2419 ; AVX1-NEXT: callq __truncdfhf2
2420 ; AVX1-NEXT: movzwl %ax, %eax
2421 ; AVX1-NEXT: orl %ebx, %eax
2422 ; AVX1-NEXT: shlq $32, %rax
2423 ; AVX1-NEXT: orq %r14, %rax
2424 ; AVX1-NEXT: vmovq %rax, %xmm0
2425 ; AVX1-NEXT: addq $40, %rsp
2426 ; AVX1-NEXT: popq %rbx
2427 ; AVX1-NEXT: popq %r14
2428 ; AVX1-NEXT: retq
2432 ; AVX2-NEXT: pushq %r14
2433 ; AVX2-NEXT: .Ltmp3:
2434 ; AVX2-NEXT: .cfi_def_cfa_offset 16
2435 ; AVX2-NEXT: pushq %rbx
2436 ; AVX2-NEXT: .Ltmp4:
2437 ; AVX2-NEXT: .cfi_def_cfa_offset 24
2438 ; AVX2-NEXT: subq $40, %rsp
2439 ; AVX2-NEXT: .Ltmp5:
2440 ; AVX2-NEXT: .cfi_def_cfa_offset 64
2441 ; AVX2-NEXT: .Ltmp6:
2442 ; AVX2-NEXT: .cfi_offset %rbx, -24
2443 ; AVX2-NEXT: .Ltmp7:
2444 ; AVX2-NEXT: .cfi_offset %r14, -16
2445 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2446 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2447 ; AVX2-NEXT: vzeroupper
2448 ; AVX2-NEXT: callq __truncdfhf2
2449 ; AVX2-NEXT: movw %ax, %bx
2450 ; AVX2-NEXT: shll $16, %ebx
2451 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2452 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2453 ; AVX2-NEXT: vzeroupper
2454 ; AVX2-NEXT: callq __truncdfhf2
2455 ; AVX2-NEXT: movzwl %ax, %r14d
2456 ; AVX2-NEXT: orl %ebx, %r14d
2457 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2458 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2459 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2460 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2461 ; AVX2-NEXT: vzeroupper
2462 ; AVX2-NEXT: callq __truncdfhf2
2463 ; AVX2-NEXT: movw %ax, %bx
2464 ; AVX2-NEXT: shll $16, %ebx
2465 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2466 ; AVX2-NEXT: callq __truncdfhf2
2467 ; AVX2-NEXT: movzwl %ax, %eax
2468 ; AVX2-NEXT: orl %ebx, %eax
2469 ; AVX2-NEXT: shlq $32, %rax
2470 ; AVX2-NEXT: orq %r14, %rax
2471 ; AVX2-NEXT: vmovq %rax, %xmm0
2472 ; AVX2-NEXT: addq $40, %rsp
2473 ; AVX2-NEXT: popq %rbx
2474 ; AVX2-NEXT: popq %r14
2475 ; AVX2-NEXT: retq
2479 ; AVX512-NEXT: pushq %r14
2480 ; AVX512-NEXT: .Ltmp3:
2481 ; AVX512-NEXT: .cfi_def_cfa_offset 16
2482 ; AVX512-NEXT: pushq %rbx
2483 ; AVX512-NEXT: .Ltmp4:
2484 ; AVX512-NEXT: .cfi_def_cfa_offset 24
2485 ; AVX512-NEXT: subq $40, %rsp
2486 ; AVX512-NEXT: .Ltmp5:
2487 ; AVX512-NEXT: .cfi_def_cfa_offset 64
2488 ; AVX512-NEXT: .Ltmp6:
2489 ; AVX512-NEXT: .cfi_offset %rbx, -24
2490 ; AVX512-NEXT: .Ltmp7:
2491 ; AVX512-NEXT: .cfi_offset %r14, -16
2492 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2493 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2494 ; AVX512-NEXT: callq __truncdfhf2
2495 ; AVX512-NEXT: movw %ax, %bx
2496 ; AVX512-NEXT: shll $16, %ebx
2497 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2498 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2499 ; AVX512-NEXT: callq __truncdfhf2
2500 ; AVX512-NEXT: movzwl %ax, %r14d
2501 ; AVX512-NEXT: orl %ebx, %r14d
2502 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2503 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2504 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2505 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2506 ; AVX512-NEXT: callq __truncdfhf2
2507 ; AVX512-NEXT: movw %ax, %bx
2508 ; AVX512-NEXT: shll $16, %ebx
2509 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2510 ; AVX512-NEXT: callq __truncdfhf2
2511 ; AVX512-NEXT: movzwl %ax, %eax
2512 ; AVX512-NEXT: orl %ebx, %eax
2513 ; AVX512-NEXT: shlq $32, %rax
2514 ; AVX512-NEXT: orq %r14, %rax
2515 ; AVX512-NEXT: vmovq %rax, %xmm0
2516 ; AVX512-NEXT: addq $40, %rsp
2517 ; AVX512-NEXT: popq %rbx
2518 ; AVX512-NEXT: popq %r14
2519 ; AVX512-NEXT: retq
2528 ; AVX1-NEXT: pushq %r14
2529 ; AVX1-NEXT: .Ltmp8:
2530 ; AVX1-NEXT: .cfi_def_cfa_offset 16
2531 ; AVX1-NEXT: pushq %rbx
2532 ; AVX1-NEXT: .Ltmp9:
2533 ; AVX1-NEXT: .cfi_def_cfa_offset 24
2534 ; AVX1-NEXT: subq $40, %rsp
2535 ; AVX1-NEXT: .Ltmp10:
2536 ; AVX1-NEXT: .cfi_def_cfa_offset 64
2537 ; AVX1-NEXT: .Ltmp11:
2538 ; AVX1-NEXT: .cfi_offset %rbx, -24
2539 ; AVX1-NEXT: .Ltmp12:
2540 ; AVX1-NEXT: .cfi_offset %r14, -16
2541 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2542 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2543 ; AVX1-NEXT: vzeroupper
2544 ; AVX1-NEXT: callq __truncdfhf2
2545 ; AVX1-NEXT: movw %ax, %bx
2546 ; AVX1-NEXT: shll $16, %ebx
2547 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2548 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2549 ; AVX1-NEXT: vzeroupper
2550 ; AVX1-NEXT: callq __truncdfhf2
2551 ; AVX1-NEXT: movzwl %ax, %r14d
2552 ; AVX1-NEXT: orl %ebx, %r14d
2553 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2554 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2555 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2556 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2557 ; AVX1-NEXT: vzeroupper
2558 ; AVX1-NEXT: callq __truncdfhf2
2559 ; AVX1-NEXT: movw %ax, %bx
2560 ; AVX1-NEXT: shll $16, %ebx
2561 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2562 ; AVX1-NEXT: callq __truncdfhf2
2563 ; AVX1-NEXT: movzwl %ax, %eax
2564 ; AVX1-NEXT: orl %ebx, %eax
2565 ; AVX1-NEXT: shlq $32, %rax
2566 ; AVX1-NEXT: orq %r14, %rax
2567 ; AVX1-NEXT: vmovq %rax, %xmm0
2568 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2569 ; AVX1-NEXT: addq $40, %rsp
2570 ; AVX1-NEXT: popq %rbx
2571 ; AVX1-NEXT: popq %r14
2572 ; AVX1-NEXT: retq
2576 ; AVX2-NEXT: pushq %r14
2577 ; AVX2-NEXT: .Ltmp8:
2578 ; AVX2-NEXT: .cfi_def_cfa_offset 16
2579 ; AVX2-NEXT: pushq %rbx
2580 ; AVX2-NEXT: .Ltmp9:
2581 ; AVX2-NEXT: .cfi_def_cfa_offset 24
2582 ; AVX2-NEXT: subq $40, %rsp
2583 ; AVX2-NEXT: .Ltmp10:
2584 ; AVX2-NEXT: .cfi_def_cfa_offset 64
2585 ; AVX2-NEXT: .Ltmp11:
2586 ; AVX2-NEXT: .cfi_offset %rbx, -24
2587 ; AVX2-NEXT: .Ltmp12:
2588 ; AVX2-NEXT: .cfi_offset %r14, -16
2589 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2590 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2591 ; AVX2-NEXT: vzeroupper
2592 ; AVX2-NEXT: callq __truncdfhf2
2593 ; AVX2-NEXT: movw %ax, %bx
2594 ; AVX2-NEXT: shll $16, %ebx
2595 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2596 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2597 ; AVX2-NEXT: vzeroupper
2598 ; AVX2-NEXT: callq __truncdfhf2
2599 ; AVX2-NEXT: movzwl %ax, %r14d
2600 ; AVX2-NEXT: orl %ebx, %r14d
2601 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2602 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2603 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2604 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2605 ; AVX2-NEXT: vzeroupper
2606 ; AVX2-NEXT: callq __truncdfhf2
2607 ; AVX2-NEXT: movw %ax, %bx
2608 ; AVX2-NEXT: shll $16, %ebx
2609 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2610 ; AVX2-NEXT: callq __truncdfhf2
2611 ; AVX2-NEXT: movzwl %ax, %eax
2612 ; AVX2-NEXT: orl %ebx, %eax
2613 ; AVX2-NEXT: shlq $32, %rax
2614 ; AVX2-NEXT: orq %r14, %rax
2615 ; AVX2-NEXT: vmovq %rax, %xmm0
2616 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2617 ; AVX2-NEXT: addq $40, %rsp
2618 ; AVX2-NEXT: popq %rbx
2619 ; AVX2-NEXT: popq %r14
2620 ; AVX2-NEXT: retq
2624 ; AVX512-NEXT: pushq %r14
2625 ; AVX512-NEXT: .Ltmp8:
2626 ; AVX512-NEXT: .cfi_def_cfa_offset 16
2627 ; AVX512-NEXT: pushq %rbx
2628 ; AVX512-NEXT: .Ltmp9:
2629 ; AVX512-NEXT: .cfi_def_cfa_offset 24
2630 ; AVX512-NEXT: subq $40, %rsp
2631 ; AVX512-NEXT: .Ltmp10:
2632 ; AVX512-NEXT: .cfi_def_cfa_offset 64
2633 ; AVX512-NEXT: .Ltmp11:
2634 ; AVX512-NEXT: .cfi_offset %rbx, -24
2635 ; AVX512-NEXT: .Ltmp12:
2636 ; AVX512-NEXT: .cfi_offset %r14, -16
2637 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2638 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2639 ; AVX512-NEXT: callq __truncdfhf2
2640 ; AVX512-NEXT: movw %ax, %bx
2641 ; AVX512-NEXT: shll $16, %ebx
2642 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2643 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2644 ; AVX512-NEXT: callq __truncdfhf2
2645 ; AVX512-NEXT: movzwl %ax, %r14d
2646 ; AVX512-NEXT: orl %ebx, %r14d
2647 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2648 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2649 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2650 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2651 ; AVX512-NEXT: callq __truncdfhf2
2652 ; AVX512-NEXT: movw %ax, %bx
2653 ; AVX512-NEXT: shll $16, %ebx
2654 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2655 ; AVX512-NEXT: callq __truncdfhf2
2656 ; AVX512-NEXT: movzwl %ax, %eax
2657 ; AVX512-NEXT: orl %ebx, %eax
2658 ; AVX512-NEXT: shlq $32, %rax
2659 ; AVX512-NEXT: orq %r14, %rax
2660 ; AVX512-NEXT: vmovq %rax, %xmm0
2661 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2662 ; AVX512-NEXT: addq $40, %rsp
2663 ; AVX512-NEXT: popq %rbx
2664 ; AVX512-NEXT: popq %r14
2665 ; AVX512-NEXT: retq
2675 ; AVX1-NEXT: pushq %r14
2676 ; AVX1-NEXT: .Ltmp13:
2677 ; AVX1-NEXT: .cfi_def_cfa_offset 16
2678 ; AVX1-NEXT: pushq %rbx
2679 ; AVX1-NEXT: .Ltmp14:
2680 ; AVX1-NEXT: .cfi_def_cfa_offset 24
2681 ; AVX1-NEXT: subq $40, %rsp
2682 ; AVX1-NEXT: .Ltmp15:
2683 ; AVX1-NEXT: .cfi_def_cfa_offset 64
2684 ; AVX1-NEXT: .Ltmp16:
2685 ; AVX1-NEXT: .cfi_offset %rbx, -24
2686 ; AVX1-NEXT: .Ltmp17:
2687 ; AVX1-NEXT: .cfi_offset %r14, -16
2688 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2689 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2690 ; AVX1-NEXT: vzeroupper
2691 ; AVX1-NEXT: callq __truncdfhf2
2692 ; AVX1-NEXT: movw %ax, %bx
2693 ; AVX1-NEXT: shll $16, %ebx
2694 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2695 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2696 ; AVX1-NEXT: vzeroupper
2697 ; AVX1-NEXT: callq __truncdfhf2
2698 ; AVX1-NEXT: movzwl %ax, %r14d
2699 ; AVX1-NEXT: orl %ebx, %r14d
2700 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2701 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2702 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2703 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2704 ; AVX1-NEXT: vzeroupper
2705 ; AVX1-NEXT: callq __truncdfhf2
2706 ; AVX1-NEXT: movw %ax, %bx
2707 ; AVX1-NEXT: shll $16, %ebx
2708 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2709 ; AVX1-NEXT: callq __truncdfhf2
2710 ; AVX1-NEXT: movzwl %ax, %eax
2711 ; AVX1-NEXT: orl %ebx, %eax
2712 ; AVX1-NEXT: shlq $32, %rax
2713 ; AVX1-NEXT: orq %r14, %rax
2714 ; AVX1-NEXT: vmovq %rax, %xmm0
2715 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2716 ; AVX1-NEXT: addq $40, %rsp
2717 ; AVX1-NEXT: popq %rbx
2718 ; AVX1-NEXT: popq %r14
2719 ; AVX1-NEXT: retq
2723 ; AVX2-NEXT: pushq %r14
2724 ; AVX2-NEXT: .Ltmp13:
2725 ; AVX2-NEXT: .cfi_def_cfa_offset 16
2726 ; AVX2-NEXT: pushq %rbx
2727 ; AVX2-NEXT: .Ltmp14:
2728 ; AVX2-NEXT: .cfi_def_cfa_offset 24
2729 ; AVX2-NEXT: subq $40, %rsp
2730 ; AVX2-NEXT: .Ltmp15:
2731 ; AVX2-NEXT: .cfi_def_cfa_offset 64
2732 ; AVX2-NEXT: .Ltmp16:
2733 ; AVX2-NEXT: .cfi_offset %rbx, -24
2734 ; AVX2-NEXT: .Ltmp17:
2735 ; AVX2-NEXT: .cfi_offset %r14, -16
2736 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2737 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2738 ; AVX2-NEXT: vzeroupper
2739 ; AVX2-NEXT: callq __truncdfhf2
2740 ; AVX2-NEXT: movw %ax, %bx
2741 ; AVX2-NEXT: shll $16, %ebx
2742 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2743 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2744 ; AVX2-NEXT: vzeroupper
2745 ; AVX2-NEXT: callq __truncdfhf2
2746 ; AVX2-NEXT: movzwl %ax, %r14d
2747 ; AVX2-NEXT: orl %ebx, %r14d
2748 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2749 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2750 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2751 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2752 ; AVX2-NEXT: vzeroupper
2753 ; AVX2-NEXT: callq __truncdfhf2
2754 ; AVX2-NEXT: movw %ax, %bx
2755 ; AVX2-NEXT: shll $16, %ebx
2756 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2757 ; AVX2-NEXT: callq __truncdfhf2
2758 ; AVX2-NEXT: movzwl %ax, %eax
2759 ; AVX2-NEXT: orl %ebx, %eax
2760 ; AVX2-NEXT: shlq $32, %rax
2761 ; AVX2-NEXT: orq %r14, %rax
2762 ; AVX2-NEXT: vmovq %rax, %xmm0
2763 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2764 ; AVX2-NEXT: addq $40, %rsp
2765 ; AVX2-NEXT: popq %rbx
2766 ; AVX2-NEXT: popq %r14
2767 ; AVX2-NEXT: retq
2771 ; AVX512-NEXT: pushq %r14
2772 ; AVX512-NEXT: .Ltmp13:
2773 ; AVX512-NEXT: .cfi_def_cfa_offset 16
2774 ; AVX512-NEXT: pushq %rbx
2775 ; AVX512-NEXT: .Ltmp14:
2776 ; AVX512-NEXT: .cfi_def_cfa_offset 24
2777 ; AVX512-NEXT: subq $40, %rsp
2778 ; AVX512-NEXT: .Ltmp15:
2779 ; AVX512-NEXT: .cfi_def_cfa_offset 64
2780 ; AVX512-NEXT: .Ltmp16:
2781 ; AVX512-NEXT: .cfi_offset %rbx, -24
2782 ; AVX512-NEXT: .Ltmp17:
2783 ; AVX512-NEXT: .cfi_offset %r14, -16
2784 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2785 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2786 ; AVX512-NEXT: callq __truncdfhf2
2787 ; AVX512-NEXT: movw %ax, %bx
2788 ; AVX512-NEXT: shll $16, %ebx
2789 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2790 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2791 ; AVX512-NEXT: callq __truncdfhf2
2792 ; AVX512-NEXT: movzwl %ax, %r14d
2793 ; AVX512-NEXT: orl %ebx, %r14d
2794 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2795 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2796 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2797 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2798 ; AVX512-NEXT: callq __truncdfhf2
2799 ; AVX512-NEXT: movw %ax, %bx
2800 ; AVX512-NEXT: shll $16, %ebx
2801 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2802 ; AVX512-NEXT: callq __truncdfhf2
2803 ; AVX512-NEXT: movzwl %ax, %eax
2804 ; AVX512-NEXT: orl %ebx, %eax
2805 ; AVX512-NEXT: shlq $32, %rax
2806 ; AVX512-NEXT: orq %r14, %rax
2807 ; AVX512-NEXT: vmovq %rax, %xmm0
2808 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2809 ; AVX512-NEXT: addq $40, %rsp
2810 ; AVX512-NEXT: popq %rbx
2811 ; AVX512-NEXT: popq %r14
2812 ; AVX512-NEXT: retq
2822 ; AVX1-NEXT: pushq %r15
2823 ; AVX1-NEXT: .Ltmp18:
2824 ; AVX1-NEXT: .cfi_def_cfa_offset 16
2825 ; AVX1-NEXT: pushq %r14
2826 ; AVX1-NEXT: .Ltmp19:
2827 ; AVX1-NEXT: .cfi_def_cfa_offset 24
2828 ; AVX1-NEXT: pushq %rbx
2829 ; AVX1-NEXT: .Ltmp20:
2830 ; AVX1-NEXT: .cfi_def_cfa_offset 32
2831 ; AVX1-NEXT: subq $64, %rsp
2832 ; AVX1-NEXT: .Ltmp21:
2833 ; AVX1-NEXT: .cfi_def_cfa_offset 96
2834 ; AVX1-NEXT: .Ltmp22:
2835 ; AVX1-NEXT: .cfi_offset %rbx, -32
2836 ; AVX1-NEXT: .Ltmp23:
2837 ; AVX1-NEXT: .cfi_offset %r14, -24
2838 ; AVX1-NEXT: .Ltmp24:
2839 ; AVX1-NEXT: .cfi_offset %r15, -16
2840 ; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
2841 ; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
2842 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2843 ; AVX1-NEXT: vzeroupper
2844 ; AVX1-NEXT: callq __truncdfhf2
2845 ; AVX1-NEXT: movw %ax, %bx
2846 ; AVX1-NEXT: shll $16, %ebx
2847 ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
2848 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2849 ; AVX1-NEXT: vzeroupper
2850 ; AVX1-NEXT: callq __truncdfhf2
2851 ; AVX1-NEXT: movzwl %ax, %r15d
2852 ; AVX1-NEXT: orl %ebx, %r15d
2853 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
2854 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2855 ; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
2856 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2857 ; AVX1-NEXT: vzeroupper
2858 ; AVX1-NEXT: callq __truncdfhf2
2859 ; AVX1-NEXT: movw %ax, %bx
2860 ; AVX1-NEXT: shll $16, %ebx
2861 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
2862 ; AVX1-NEXT: callq __truncdfhf2
2863 ; AVX1-NEXT: movzwl %ax, %r14d
2864 ; AVX1-NEXT: orl %ebx, %r14d
2865 ; AVX1-NEXT: shlq $32, %r14
2866 ; AVX1-NEXT: orq %r15, %r14
2867 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2868 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2869 ; AVX1-NEXT: vzeroupper
2870 ; AVX1-NEXT: callq __truncdfhf2
2871 ; AVX1-NEXT: movw %ax, %bx
2872 ; AVX1-NEXT: shll $16, %ebx
2873 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2874 ; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2875 ; AVX1-NEXT: vzeroupper
2876 ; AVX1-NEXT: callq __truncdfhf2
2877 ; AVX1-NEXT: movzwl %ax, %r15d
2878 ; AVX1-NEXT: orl %ebx, %r15d
2879 ; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2880 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2881 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2882 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2883 ; AVX1-NEXT: vzeroupper
2884 ; AVX1-NEXT: callq __truncdfhf2
2885 ; AVX1-NEXT: movw %ax, %bx
2886 ; AVX1-NEXT: shll $16, %ebx
2887 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2888 ; AVX1-NEXT: callq __truncdfhf2
2889 ; AVX1-NEXT: movzwl %ax, %eax
2890 ; AVX1-NEXT: orl %ebx, %eax
2891 ; AVX1-NEXT: shlq $32, %rax
2892 ; AVX1-NEXT: orq %r15, %rax
2893 ; AVX1-NEXT: vmovq %rax, %xmm0
2894 ; AVX1-NEXT: vmovq %r14, %xmm1
2895 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2896 ; AVX1-NEXT: addq $64, %rsp
2897 ; AVX1-NEXT: popq %rbx
2898 ; AVX1-NEXT: popq %r14
2899 ; AVX1-NEXT: popq %r15
2900 ; AVX1-NEXT: retq
2904 ; AVX2-NEXT: pushq %r15
2905 ; AVX2-NEXT: .Ltmp18:
2906 ; AVX2-NEXT: .cfi_def_cfa_offset 16
2907 ; AVX2-NEXT: pushq %r14
2908 ; AVX2-NEXT: .Ltmp19:
2909 ; AVX2-NEXT: .cfi_def_cfa_offset 24
2910 ; AVX2-NEXT: pushq %rbx
2911 ; AVX2-NEXT: .Ltmp20:
2912 ; AVX2-NEXT: .cfi_def_cfa_offset 32
2913 ; AVX2-NEXT: subq $64, %rsp
2914 ; AVX2-NEXT: .Ltmp21:
2915 ; AVX2-NEXT: .cfi_def_cfa_offset 96
2916 ; AVX2-NEXT: .Ltmp22:
2917 ; AVX2-NEXT: .cfi_offset %rbx, -32
2918 ; AVX2-NEXT: .Ltmp23:
2919 ; AVX2-NEXT: .cfi_offset %r14, -24
2920 ; AVX2-NEXT: .Ltmp24:
2921 ; AVX2-NEXT: .cfi_offset %r15, -16
2922 ; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
2923 ; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
2924 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2925 ; AVX2-NEXT: vzeroupper
2926 ; AVX2-NEXT: callq __truncdfhf2
2927 ; AVX2-NEXT: movw %ax, %bx
2928 ; AVX2-NEXT: shll $16, %ebx
2929 ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
2930 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2931 ; AVX2-NEXT: vzeroupper
2932 ; AVX2-NEXT: callq __truncdfhf2
2933 ; AVX2-NEXT: movzwl %ax, %r15d
2934 ; AVX2-NEXT: orl %ebx, %r15d
2935 ; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
2936 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2937 ; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
2938 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2939 ; AVX2-NEXT: vzeroupper
2940 ; AVX2-NEXT: callq __truncdfhf2
2941 ; AVX2-NEXT: movw %ax, %bx
2942 ; AVX2-NEXT: shll $16, %ebx
2943 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
2944 ; AVX2-NEXT: callq __truncdfhf2
2945 ; AVX2-NEXT: movzwl %ax, %r14d
2946 ; AVX2-NEXT: orl %ebx, %r14d
2947 ; AVX2-NEXT: shlq $32, %r14
2948 ; AVX2-NEXT: orq %r15, %r14
2949 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2950 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2951 ; AVX2-NEXT: vzeroupper
2952 ; AVX2-NEXT: callq __truncdfhf2
2953 ; AVX2-NEXT: movw %ax, %bx
2954 ; AVX2-NEXT: shll $16, %ebx
2955 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2956 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2957 ; AVX2-NEXT: vzeroupper
2958 ; AVX2-NEXT: callq __truncdfhf2
2959 ; AVX2-NEXT: movzwl %ax, %r15d
2960 ; AVX2-NEXT: orl %ebx, %r15d
2961 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2962 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2963 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2964 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2965 ; AVX2-NEXT: vzeroupper
2966 ; AVX2-NEXT: callq __truncdfhf2
2967 ; AVX2-NEXT: movw %ax, %bx
2968 ; AVX2-NEXT: shll $16, %ebx
2969 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2970 ; AVX2-NEXT: callq __truncdfhf2
2971 ; AVX2-NEXT: movzwl %ax, %eax
2972 ; AVX2-NEXT: orl %ebx, %eax
2973 ; AVX2-NEXT: shlq $32, %rax
2974 ; AVX2-NEXT: orq %r15, %rax
2975 ; AVX2-NEXT: vmovq %rax, %xmm0
2976 ; AVX2-NEXT: vmovq %r14, %xmm1
2977 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2978 ; AVX2-NEXT: addq $64, %rsp
2979 ; AVX2-NEXT: popq %rbx
2980 ; AVX2-NEXT: popq %r14
2981 ; AVX2-NEXT: popq %r15
2982 ; AVX2-NEXT: retq
2986 ; AVX512-NEXT: pushq %r15
2987 ; AVX512-NEXT: .Ltmp18:
2988 ; AVX512-NEXT: .cfi_def_cfa_offset 16
2989 ; AVX512-NEXT: pushq %r14
2990 ; AVX512-NEXT: .Ltmp19:
2991 ; AVX512-NEXT: .cfi_def_cfa_offset 24
2992 ; AVX512-NEXT: pushq %rbx
2993 ; AVX512-NEXT: .Ltmp20:
2994 ; AVX512-NEXT: .cfi_def_cfa_offset 32
2995 ; AVX512-NEXT: subq $96, %rsp
2996 ; AVX512-NEXT: .Ltmp21:
2997 ; AVX512-NEXT: .cfi_def_cfa_offset 128
2998 ; AVX512-NEXT: .Ltmp22:
2999 ; AVX512-NEXT: .cfi_offset %rbx, -32
3000 ; AVX512-NEXT: .Ltmp23:
3001 ; AVX512-NEXT: .cfi_offset %r14, -24
3002 ; AVX512-NEXT: .Ltmp24:
3003 ; AVX512-NEXT: .cfi_offset %r15, -16
3004 ; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
3005 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3006 ; AVX512-NEXT: callq __truncdfhf2
3007 ; AVX512-NEXT: movw %ax, %bx
3008 ; AVX512-NEXT: shll $16, %ebx
3009 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
3010 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
3011 ; AVX512-NEXT: callq __truncdfhf2
3012 ; AVX512-NEXT: movzwl %ax, %r15d
3013 ; AVX512-NEXT: orl %ebx, %r15d
3014 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
3015 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3016 ; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3017 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3018 ; AVX512-NEXT: callq __truncdfhf2
3019 ; AVX512-NEXT: movw %ax, %bx
3020 ; AVX512-NEXT: shll $16, %ebx
3021 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3022 ; AVX512-NEXT: callq __truncdfhf2
3023 ; AVX512-NEXT: movzwl %ax, %r14d
3024 ; AVX512-NEXT: orl %ebx, %r14d
3025 ; AVX512-NEXT: shlq $32, %r14
3026 ; AVX512-NEXT: orq %r15, %r14
3027 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
3028 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3029 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3030 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3031 ; AVX512-NEXT: callq __truncdfhf2
3032 ; AVX512-NEXT: movw %ax, %bx
3033 ; AVX512-NEXT: shll $16, %ebx
3034 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3035 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3036 ; AVX512-NEXT: callq __truncdfhf2
3037 ; AVX512-NEXT: movzwl %ax, %r15d
3038 ; AVX512-NEXT: orl %ebx, %r15d
3039 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3040 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3041 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3042 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3043 ; AVX512-NEXT: callq __truncdfhf2
3044 ; AVX512-NEXT: movw %ax, %bx
3045 ; AVX512-NEXT: shll $16, %ebx
3046 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3047 ; AVX512-NEXT: callq __truncdfhf2
3048 ; AVX512-NEXT: movzwl %ax, %eax
3049 ; AVX512-NEXT: orl %ebx, %eax
3050 ; AVX512-NEXT: shlq $32, %rax
3051 ; AVX512-NEXT: orq %r15, %rax
3052 ; AVX512-NEXT: vmovq %rax, %xmm0
3053 ; AVX512-NEXT: vmovq %r14, %xmm1
3054 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3055 ; AVX512-NEXT: addq $96, %rsp
3056 ; AVX512-NEXT: popq %rbx
3057 ; AVX512-NEXT: popq %r14
3058 ; AVX512-NEXT: popq %r15
3059 ; AVX512-NEXT: retq
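; NOTE: The remaining store variants combine the two patterns above: each
; double goes through a __truncdfhf2 call, and the returned halves are written
; to memory via movw (or packed and stored with vmovdqa in the widened cases).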
3072 ; ALL-NEXT: pushq %rbx
3073 ; ALL-NEXT: .Ltmp25:
3074 ; ALL-NEXT: .cfi_def_cfa_offset 16
3075 ; ALL-NEXT: .Ltmp26:
3076 ; ALL-NEXT: .cfi_offset %rbx, -16
3077 ; ALL-NEXT: movq %rdi, %rbx
3078 ; ALL-NEXT: callq __truncdfhf2
3079 ; ALL-NEXT: movw %ax, (%rbx)
3080 ; ALL-NEXT: popq %rbx
3081 ; ALL-NEXT: retq
3091 ; ALL-NEXT: pushq %rbp
3092 ; ALL-NEXT: .Ltmp27:
3093 ; ALL-NEXT: .cfi_def_cfa_offset 16
3094 ; ALL-NEXT: pushq %rbx
3095 ; ALL-NEXT: .Ltmp28:
3096 ; ALL-NEXT: .cfi_def_cfa_offset 24
3097 ; ALL-NEXT: subq $24, %rsp
3098 ; ALL-NEXT: .Ltmp29:
3099 ; ALL-NEXT: .cfi_def_cfa_offset 48
3100 ; ALL-NEXT: .Ltmp30:
3101 ; ALL-NEXT: .cfi_offset %rbx, -24
3102 ; ALL-NEXT: .Ltmp31:
3103 ; ALL-NEXT: .cfi_offset %rbp, -16
3104 ; ALL-NEXT: movq %rdi, %rbx
3105 ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3106 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3107 ; ALL-NEXT: callq __truncdfhf2
3108 ; ALL-NEXT: movl %eax, %ebp
3109 ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3110 ; ALL-NEXT: callq __truncdfhf2
3111 ; ALL-NEXT: movw %ax, (%rbx)
3112 ; ALL-NEXT: movw %bp, 2(%rbx)
3113 ; ALL-NEXT: addq $24, %rsp
3114 ; ALL-NEXT: popq %rbx
3115 ; ALL-NEXT: popq %rbp
3116 ; ALL-NEXT: retq
3126 ; AVX1-NEXT: pushq %rbp
3127 ; AVX1-NEXT: .Ltmp32:
3128 ; AVX1-NEXT: .cfi_def_cfa_offset 16
3129 ; AVX1-NEXT: pushq %r15
3130 ; AVX1-NEXT: .Ltmp33:
3131 ; AVX1-NEXT: .cfi_def_cfa_offset 24
3132 ; AVX1-NEXT: pushq %r14
3133 ; AVX1-NEXT: .Ltmp34:
3134 ; AVX1-NEXT: .cfi_def_cfa_offset 32
3135 ; AVX1-NEXT: pushq %rbx
3136 ; AVX1-NEXT: .Ltmp35:
3137 ; AVX1-NEXT: .cfi_def_cfa_offset 40
3138 ; AVX1-NEXT: subq $88, %rsp
3139 ; AVX1-NEXT: .Ltmp36:
3140 ; AVX1-NEXT: .cfi_def_cfa_offset 128
3141 ; AVX1-NEXT: .Ltmp37:
3142 ; AVX1-NEXT: .cfi_offset %rbx, -40
3143 ; AVX1-NEXT: .Ltmp38:
3144 ; AVX1-NEXT: .cfi_offset %r14, -32
3145 ; AVX1-NEXT: .Ltmp39:
3146 ; AVX1-NEXT: .cfi_offset %r15, -24
3147 ; AVX1-NEXT: .Ltmp40:
3148 ; AVX1-NEXT: .cfi_offset %rbp, -16
3149 ; AVX1-NEXT: movq %rdi, %rbx
3150 ; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
3151 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3152 ; AVX1-NEXT: vzeroupper
3153 ; AVX1-NEXT: callq __truncdfhf2
3154 ; AVX1-NEXT: movl %eax, %r14d
3155 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3156 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r15d
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, 4(%rbx)
; AVX1-NEXT: movw %bp, (%rbx)
; AVX1-NEXT: movw %r15w, 6(%rbx)
; AVX1-NEXT: movw %r14w, 2(%rbx)
; AVX1-NEXT: addq $88, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .Ltmp32:
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: .Ltmp33:
; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: .Ltmp34:
; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: .Ltmp35:
; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: subq $88, %rsp
; AVX2-NEXT: .Ltmp36:
; AVX2-NEXT: .cfi_def_cfa_offset 128
; AVX2-NEXT: .Ltmp37:
; AVX2-NEXT: .cfi_offset %rbx, -40
; AVX2-NEXT: .Ltmp38:
; AVX2-NEXT: .cfi_offset %r14, -32
; AVX2-NEXT: .Ltmp39:
; AVX2-NEXT: .cfi_offset %r15, -24
; AVX2-NEXT: .Ltmp40:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r14d
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r15d
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, 4(%rbx)
; AVX2-NEXT: movw %bp, (%rbx)
; AVX2-NEXT: movw %r15w, 6(%rbx)
; AVX2-NEXT: movw %r14w, 2(%rbx)
; AVX2-NEXT: addq $88, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: .Ltmp32:
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: .Ltmp33:
; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: .Ltmp34:
; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: .Ltmp35:
; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: subq $88, %rsp
; AVX512-NEXT: .Ltmp36:
; AVX512-NEXT: .cfi_def_cfa_offset 128
; AVX512-NEXT: .Ltmp37:
; AVX512-NEXT: .cfi_offset %rbx, -40
; AVX512-NEXT: .Ltmp38:
; AVX512-NEXT: .cfi_offset %r14, -32
; AVX512-NEXT: .Ltmp39:
; AVX512-NEXT: .cfi_offset %r15, -24
; AVX512-NEXT: .Ltmp40:
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r14d
; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r15d
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %ebp
; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, 4(%rbx)
; AVX512-NEXT: movw %bp, (%rbx)
; AVX512-NEXT: movw %r15w, 6(%rbx)
; AVX512-NEXT: movw %r14w, 2(%rbx)
; AVX512-NEXT: addq $88, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
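; The three blocks above cover a 4 x double -> 4 x half truncating store:
; each element is extracted in turn (vpermilpd/vextractf128), converted
; through the __truncdfhf2 libcall, and the four i16 results are written to
; offsets 0/2/4/6 of the destination. A minimal IR sketch of the pattern
; being tested (function and value names here are hypothetical,
; reconstructed from the checks rather than copied from this file):
;
;   define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
;     %1 = fptrunc <4 x double> %a0 to <4 x half>
;     %2 = bitcast <4 x half> %1 to <4 x i16>
;     store <4 x i16> %2, <4 x i16>* %a1
;     ret void
;   }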
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: .Ltmp41:
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: .Ltmp42:
; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: .Ltmp43:
; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: subq $32, %rsp
; AVX1-NEXT: .Ltmp44:
; AVX1-NEXT: .cfi_def_cfa_offset 64
; AVX1-NEXT: .Ltmp45:
; AVX1-NEXT: .cfi_offset %rbx, -32
; AVX1-NEXT: .Ltmp46:
; AVX1-NEXT: .cfi_offset %r14, -24
; AVX1-NEXT: .Ltmp47:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rdi, %r14
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, %bp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %ebx
; AVX1-NEXT: orl %ebp, %ebx
; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, %bp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %eax
; AVX1-NEXT: orl %ebp, %eax
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %rbx, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovdqa %xmm0, (%r14)
; AVX1-NEXT: addq $32, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .Ltmp41:
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: .Ltmp42:
; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: .Ltmp43:
; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: subq $32, %rsp
; AVX2-NEXT: .Ltmp44:
; AVX2-NEXT: .cfi_def_cfa_offset 64
; AVX2-NEXT: .Ltmp45:
; AVX2-NEXT: .cfi_offset %rbx, -32
; AVX2-NEXT: .Ltmp46:
; AVX2-NEXT: .cfi_offset %r14, -24
; AVX2-NEXT: .Ltmp47:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rdi, %r14
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %ebx
; AVX2-NEXT: orl %ebp, %ebx
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %eax
; AVX2-NEXT: orl %ebp, %eax
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %rbx, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovdqa %xmm0, (%r14)
; AVX2-NEXT: addq $32, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: .Ltmp41:
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: .Ltmp42:
; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: .Ltmp43:
; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: subq $32, %rsp
; AVX512-NEXT: .Ltmp44:
; AVX512-NEXT: .cfi_def_cfa_offset 64
; AVX512-NEXT: .Ltmp45:
; AVX512-NEXT: .cfi_offset %rbx, -32
; AVX512-NEXT: .Ltmp46:
; AVX512-NEXT: .cfi_offset %r14, -24
; AVX512-NEXT: .Ltmp47:
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rdi, %r14
; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bp
; AVX512-NEXT: shll $16, %ebp
; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %ebx
; AVX512-NEXT: orl %ebp, %ebx
; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bp
; AVX512-NEXT: shll $16, %ebp
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: orl %ebp, %eax
; AVX512-NEXT: shlq $32, %rax
; AVX512-NEXT: orq %rbx, %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vmovdqa %xmm0, (%r14)
; AVX512-NEXT: addq $32, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
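; Here the four converted halves are packed through GPRs instead: each
; 16-bit result is shifted/or'ed into one 64-bit value (shll $16 / movzwl /
; orl, then shlq $32 / orq), moved to an XMM register with vmovq, and
; expanded with vpshufb before a full 16-byte store, leaving the upper
; eight bytes of the stored vector undefined. A hedged IR sketch of that
; pattern (names and the exact shuffle form are hypothetical, inferred from
; the checks):
;
;   define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind {
;     %1 = fptrunc <4 x double> %a0 to <4 x half>
;     %2 = bitcast <4 x half> %1 to <4 x i16>
;     %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
;     store <8 x i16> %3, <8 x i16>* %a1
;     ret void
;   }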
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: .Ltmp48:
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: .Ltmp49:
; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: .Ltmp50:
; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: subq $32, %rsp
; AVX1-NEXT: .Ltmp51:
; AVX1-NEXT: .cfi_def_cfa_offset 64
; AVX1-NEXT: .Ltmp52:
; AVX1-NEXT: .cfi_offset %rbx, -32
; AVX1-NEXT: .Ltmp53:
; AVX1-NEXT: .cfi_offset %r14, -24
; AVX1-NEXT: .Ltmp54:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rdi, %r14
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, %bp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %ebx
; AVX1-NEXT: orl %ebp, %ebx
; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, %bp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %eax
; AVX1-NEXT: orl %ebp, %eax
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %rbx, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, (%r14)
; AVX1-NEXT: addq $32, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .Ltmp48:
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: .Ltmp49:
; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: .Ltmp50:
; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: subq $32, %rsp
; AVX2-NEXT: .Ltmp51:
; AVX2-NEXT: .cfi_def_cfa_offset 64
; AVX2-NEXT: .Ltmp52:
; AVX2-NEXT: .cfi_offset %rbx, -32
; AVX2-NEXT: .Ltmp53:
; AVX2-NEXT: .cfi_offset %r14, -24
; AVX2-NEXT: .Ltmp54:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rdi, %r14
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %ebx
; AVX2-NEXT: orl %ebp, %ebx
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %eax
; AVX2-NEXT: orl %ebp, %eax
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %rbx, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vmovdqa %xmm0, (%r14)
; AVX2-NEXT: addq $32, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: .Ltmp48:
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: .Ltmp49:
; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: .Ltmp50:
; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: subq $32, %rsp
; AVX512-NEXT: .Ltmp51:
; AVX512-NEXT: .cfi_def_cfa_offset 64
; AVX512-NEXT: .Ltmp52:
; AVX512-NEXT: .cfi_offset %rbx, -32
; AVX512-NEXT: .Ltmp53:
; AVX512-NEXT: .cfi_offset %r14, -24
; AVX512-NEXT: .Ltmp54:
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rdi, %r14
; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bp
; AVX512-NEXT: shll $16, %ebp
; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %ebx
; AVX512-NEXT: orl %ebp, %ebx
; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bp
; AVX512-NEXT: shll $16, %ebp
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: orl %ebp, %eax
; AVX512-NEXT: shlq $32, %rax
; AVX512-NEXT: orq %rbx, %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vmovdqa %xmm0, (%r14)
; AVX512-NEXT: addq $32, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
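; Same GPR-packing idiom as the previous function, but here the vpshufb
; mask zeroes the upper eight bytes, i.e. the four halves are widened into
; an <8 x i16> whose high lanes are zero. A hedged IR sketch (hypothetical
; names):
;
;   define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind {
;     %1 = fptrunc <4 x double> %a0 to <4 x half>
;     %2 = bitcast <4 x half> %1 to <4 x i16>
;     %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;     store <8 x i16> %3, <8 x i16>* %a1
;     ret void
;   }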
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: .Ltmp55:
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: .Ltmp56:
; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: .Ltmp57:
; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: .Ltmp58:
; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: .Ltmp59:
; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: .Ltmp60:
; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: subq $136, %rsp
; AVX1-NEXT: .Ltmp61:
; AVX1-NEXT: .cfi_def_cfa_offset 192
; AVX1-NEXT: .Ltmp62:
; AVX1-NEXT: .cfi_offset %rbx, -56
; AVX1-NEXT: .Ltmp63:
; AVX1-NEXT: .cfi_offset %r12, -48
; AVX1-NEXT: .Ltmp64:
; AVX1-NEXT: .cfi_offset %r13, -40
; AVX1-NEXT: .Ltmp65:
; AVX1-NEXT: .cfi_offset %r14, -32
; AVX1-NEXT: .Ltmp66:
; AVX1-NEXT: .cfi_offset %r15, -24
; AVX1-NEXT: .Ltmp67:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r12d
; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r13d
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r14d
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r15d
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, 12(%rbx)
; AVX1-NEXT: movw %r15w, 8(%rbx)
; AVX1-NEXT: movw %r14w, 4(%rbx)
; AVX1-NEXT: movw %bp, (%rbx)
; AVX1-NEXT: movw %r13w, 14(%rbx)
; AVX1-NEXT: movw %r12w, 10(%rbx)
; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX1-NEXT: movw %ax, 6(%rbx)
; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX1-NEXT: movw %ax, 2(%rbx)
; AVX1-NEXT: addq $136, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .Ltmp55:
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: .Ltmp56:
; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: .Ltmp57:
; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: .Ltmp58:
; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: .Ltmp59:
; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: .Ltmp60:
; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: subq $136, %rsp
; AVX2-NEXT: .Ltmp61:
; AVX2-NEXT: .cfi_def_cfa_offset 192
; AVX2-NEXT: .Ltmp62:
; AVX2-NEXT: .cfi_offset %rbx, -56
; AVX2-NEXT: .Ltmp63:
; AVX2-NEXT: .cfi_offset %r12, -48
; AVX2-NEXT: .Ltmp64:
; AVX2-NEXT: .cfi_offset %r13, -40
; AVX2-NEXT: .Ltmp65:
; AVX2-NEXT: .cfi_offset %r14, -32
; AVX2-NEXT: .Ltmp66:
; AVX2-NEXT: .cfi_offset %r15, -24
; AVX2-NEXT: .Ltmp67:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r12d
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r13d
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r14d
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r15d
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, 12(%rbx)
; AVX2-NEXT: movw %r15w, 8(%rbx)
; AVX2-NEXT: movw %r14w, 4(%rbx)
; AVX2-NEXT: movw %bp, (%rbx)
; AVX2-NEXT: movw %r13w, 14(%rbx)
; AVX2-NEXT: movw %r12w, 10(%rbx)
; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX2-NEXT: movw %ax, 6(%rbx)
; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX2-NEXT: movw %ax, 2(%rbx)
; AVX2-NEXT: addq $136, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: .Ltmp55:
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: .Ltmp56:
; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: .Ltmp57:
; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: .Ltmp58:
; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: .Ltmp59:
; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: .Ltmp60:
; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: subq $200, %rsp
; AVX512-NEXT: .Ltmp61:
; AVX512-NEXT: .cfi_def_cfa_offset 256
; AVX512-NEXT: .Ltmp62:
; AVX512-NEXT: .cfi_offset %rbx, -56
; AVX512-NEXT: .Ltmp63:
; AVX512-NEXT: .cfi_offset %r12, -48
; AVX512-NEXT: .Ltmp64:
; AVX512-NEXT: .cfi_offset %r13, -40
; AVX512-NEXT: .Ltmp65:
; AVX512-NEXT: .cfi_offset %r14, -32
; AVX512-NEXT: .Ltmp66:
; AVX512-NEXT: .cfi_offset %r15, -24
; AVX512-NEXT: .Ltmp67:
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r12d
; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r13d
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %ebp
; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r14d
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r15d
; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, 12(%rbx)
; AVX512-NEXT: movw %r15w, 8(%rbx)
; AVX512-NEXT: movw %r14w, 4(%rbx)
; AVX512-NEXT: movw %bp, (%rbx)
; AVX512-NEXT: movw %r13w, 14(%rbx)
; AVX512-NEXT: movw %r12w, 10(%rbx)
; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX512-NEXT: movw %ax, 6(%rbx)
; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX512-NEXT: movw %ax, 2(%rbx)
; AVX512-NEXT: addq $200, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
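; The final set of blocks scales the first pattern up to eight elements:
; eight __truncdfhf2 libcalls whose results are staged in callee-saved
; registers and 2-byte stack slots, then stored to the even offsets 0..14.
; On AVX1 and AVX2 the source arrives in two YMM registers; on AVX512 it is
; a single ZMM register, split with vextractf64x4. A hedged IR sketch
; (hypothetical names):
;
;   define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
;     %1 = fptrunc <8 x double> %a0 to <8 x half>
;     %2 = bitcast <8 x half> %1 to <8 x i16>
;     store <8 x i16> %2, <8 x i16>* %a1
;     ret void
;   }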