Home | History | Annotate | Download | only in X86

Lines Matching full:next

14 ; SSE-NEXT:    movl %edi, %eax
15 ; SSE-NEXT: shlb $7, %al
16 ; SSE-NEXT: movl %edi, %ecx
17 ; SSE-NEXT: shlb $5, %cl
18 ; SSE-NEXT: andb $64, %cl
19 ; SSE-NEXT: movl %edi, %edx
20 ; SSE-NEXT: shlb $3, %dl
21 ; SSE-NEXT: andb $32, %dl
22 ; SSE-NEXT: orb %cl, %dl
23 ; SSE-NEXT: movl %edi, %ecx
24 ; SSE-NEXT: addb %cl, %cl
25 ; SSE-NEXT: andb $16, %cl
26 ; SSE-NEXT: orb %dl, %cl
27 ; SSE-NEXT: movl %edi, %edx
28 ; SSE-NEXT: shrb %dl
29 ; SSE-NEXT: andb $8, %dl
30 ; SSE-NEXT: orb %cl, %dl
31 ; SSE-NEXT: movl %edi, %ecx
32 ; SSE-NEXT: shrb $3, %cl
33 ; SSE-NEXT: andb $4, %cl
34 ; SSE-NEXT: orb %dl, %cl
35 ; SSE-NEXT: movl %edi, %edx
36 ; SSE-NEXT: shrb $5, %dl
37 ; SSE-NEXT: andb $2, %dl
38 ; SSE-NEXT: orb %cl, %dl
39 ; SSE-NEXT: shrb $7, %dil
40 ; SSE-NEXT: orb %dl, %dil
41 ; SSE-NEXT: orb %al, %dil
42 ; SSE-NEXT: movl %edi, %eax
43 ; SSE-NEXT: retq
47 ; AVX-NEXT: movl %edi, %eax
48 ; AVX-NEXT: shlb $7, %al
49 ; AVX-NEXT: movl %edi, %ecx
50 ; AVX-NEXT: shlb $5, %cl
51 ; AVX-NEXT: andb $64, %cl
52 ; AVX-NEXT: movl %edi, %edx
53 ; AVX-NEXT: shlb $3, %dl
54 ; AVX-NEXT: andb $32, %dl
55 ; AVX-NEXT: orb %cl, %dl
56 ; AVX-NEXT: movl %edi, %ecx
57 ; AVX-NEXT: addb %cl, %cl
58 ; AVX-NEXT: andb $16, %cl
59 ; AVX-NEXT: orb %dl, %cl
60 ; AVX-NEXT: movl %edi, %edx
61 ; AVX-NEXT: shrb %dl
62 ; AVX-NEXT: andb $8, %dl
63 ; AVX-NEXT: orb %cl, %dl
64 ; AVX-NEXT: movl %edi, %ecx
65 ; AVX-NEXT: shrb $3, %cl
66 ; AVX-NEXT: andb $4, %cl
67 ; AVX-NEXT: orb %dl, %cl
68 ; AVX-NEXT: movl %edi, %edx
69 ; AVX-NEXT: shrb $5, %dl
70 ; AVX-NEXT: andb $2, %dl
71 ; AVX-NEXT: orb %cl, %dl
72 ; AVX-NEXT: shrb $7, %dil
73 ; AVX-NEXT: orb %dl, %dil
74 ; AVX-NEXT: orb %al, %dil
75 ; AVX-NEXT: movl %edi, %eax
76 ; AVX-NEXT: retq
80 ; XOP-NEXT: vmovd %edi, %xmm0
81 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
82 ; XOP-NEXT: vpextrb $0, %xmm0, %eax
83 ; XOP-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
84 ; XOP-NEXT: retq
92 ; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
93 ; SSE-NEXT: movl %edi, %ecx
94 ; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
95 ; SSE-NEXT: movl %edi, %eax
96 ; SSE-NEXT: shll $15, %eax
97 ; SSE-NEXT: movl %edi, %edx
98 ; SSE-NEXT: andl $2, %edx
99 ; SSE-NEXT: shll $13, %edx
100 ; SSE-NEXT: leal (%rdx,%rax), %eax
101 ; SSE-NEXT: movl %edi, %edx
102 ; SSE-NEXT: andl $4, %edx
103 ; SSE-NEXT: shll $11, %edx
104 ; SSE-NEXT: orl %edx, %eax
105 ; SSE-NEXT: movl %edi, %edx
106 ; SSE-NEXT: andl $8, %edx
107 ; SSE-NEXT: shll $9, %edx
108 ; SSE-NEXT: orl %edx, %eax
109 ; SSE-NEXT: movl %edi, %edx
110 ; SSE-NEXT: andl $16, %edx
111 ; SSE-NEXT: shll $7, %edx
112 ; SSE-NEXT: orl %edx, %eax
113 ; SSE-NEXT: movl %edi, %edx
114 ; SSE-NEXT: andl $32, %edx
115 ; SSE-NEXT: shll $5, %edx
116 ; SSE-NEXT: orl %edx, %eax
117 ; SSE-NEXT: movl %edi, %edx
118 ; SSE-NEXT: andl $64, %edx
119 ; SSE-NEXT: shll $3, %edx
120 ; SSE-NEXT: leal (%rdi,%rdi), %esi
121 ; SSE-NEXT: andl $256, %esi # imm = 0x100
122 ; SSE-NEXT: orl %edx, %esi
123 ; SSE-NEXT: movl %edi, %edx
124 ; SSE-NEXT: shrl %edx
125 ; SSE-NEXT: andl $128, %edx
126 ; SSE-NEXT: orl %esi, %edx
127 ; SSE-NEXT: movl %edi, %esi
128 ; SSE-NEXT: shrl $3, %esi
129 ; SSE-NEXT: andl $64, %esi
130 ; SSE-NEXT: orl %edx, %esi
131 ; SSE-NEXT: movl %edi, %edx
132 ; SSE-NEXT: shrl $5, %edx
133 ; SSE-NEXT: andl $32, %edx
134 ; SSE-NEXT: orl %esi, %edx
135 ; SSE-NEXT: movl %edi, %esi
136 ; SSE-NEXT: shrl $7, %esi
137 ; SSE-NEXT: andl $16, %esi
138 ; SSE-NEXT: orl %edx, %esi
139 ; SSE-NEXT: movl %edi, %edx
140 ; SSE-NEXT: shrl $9, %edx
141 ; SSE-NEXT: andl $8, %edx
142 ; SSE-NEXT: orl %esi, %edx
143 ; SSE-NEXT: movl %edi, %esi
144 ; SSE-NEXT: shrl $11, %esi
145 ; SSE-NEXT: andl $4, %esi
146 ; SSE-NEXT: orl %edx, %esi
147 ; SSE-NEXT: shrl $13, %edi
148 ; SSE-NEXT: andl $2, %edi
149 ; SSE-NEXT: orl %esi, %edi
150 ; SSE-NEXT: shrl $15, %ecx
151 ; SSE-NEXT: orl %edi, %ecx
152 ; SSE-NEXT: orl %ecx, %eax
153 ; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
154 ; SSE-NEXT: retq
158 ; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
159 ; AVX-NEXT: movl %edi, %ecx
160 ; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
161 ; AVX-NEXT: movl %edi, %eax
162 ; AVX-NEXT: shll $15, %eax
163 ; AVX-NEXT: movl %edi, %edx
164 ; AVX-NEXT: andl $2, %edx
165 ; AVX-NEXT: shll $13, %edx
166 ; AVX-NEXT: leal (%rdx,%rax), %eax
167 ; AVX-NEXT: movl %edi, %edx
168 ; AVX-NEXT: andl $4, %edx
169 ; AVX-NEXT: shll $11, %edx
170 ; AVX-NEXT: orl %edx, %eax
171 ; AVX-NEXT: movl %edi, %edx
172 ; AVX-NEXT: andl $8, %edx
173 ; AVX-NEXT: shll $9, %edx
174 ; AVX-NEXT: orl %edx, %eax
175 ; AVX-NEXT: movl %edi, %edx
176 ; AVX-NEXT: andl $16, %edx
177 ; AVX-NEXT: shll $7, %edx
178 ; AVX-NEXT: orl %edx, %eax
179 ; AVX-NEXT: movl %edi, %edx
180 ; AVX-NEXT: andl $32, %edx
181 ; AVX-NEXT: shll $5, %edx
182 ; AVX-NEXT: orl %edx, %eax
183 ; AVX-NEXT: movl %edi, %edx
184 ; AVX-NEXT: andl $64, %edx
185 ; AVX-NEXT: shll $3, %edx
186 ; AVX-NEXT: leal (%rdi,%rdi), %esi
187 ; AVX-NEXT: andl $256, %esi # imm = 0x100
188 ; AVX-NEXT: orl %edx, %esi
189 ; AVX-NEXT: movl %edi, %edx
190 ; AVX-NEXT: shrl %edx
191 ; AVX-NEXT: andl $128, %edx
192 ; AVX-NEXT: orl %esi, %edx
193 ; AVX-NEXT: movl %edi, %esi
194 ; AVX-NEXT: shrl $3, %esi
195 ; AVX-NEXT: andl $64, %esi
196 ; AVX-NEXT: orl %edx, %esi
197 ; AVX-NEXT: movl %edi, %edx
198 ; AVX-NEXT: shrl $5, %edx
199 ; AVX-NEXT: andl $32, %edx
200 ; AVX-NEXT: orl %esi, %edx
201 ; AVX-NEXT: movl %edi, %esi
202 ; AVX-NEXT: shrl $7, %esi
203 ; AVX-NEXT: andl $16, %esi
204 ; AVX-NEXT: orl %edx, %esi
205 ; AVX-NEXT: movl %edi, %edx
206 ; AVX-NEXT: shrl $9, %edx
207 ; AVX-NEXT: andl $8, %edx
208 ; AVX-NEXT: orl %esi, %edx
209 ; AVX-NEXT: movl %edi, %esi
210 ; AVX-NEXT: shrl $11, %esi
211 ; AVX-NEXT: andl $4, %esi
212 ; AVX-NEXT: orl %edx, %esi
213 ; AVX-NEXT: shrl $13, %edi
214 ; AVX-NEXT: andl $2, %edi
215 ; AVX-NEXT: orl %esi, %edi
216 ; AVX-NEXT: shrl $15, %ecx
217 ; AVX-NEXT: orl %edi, %ecx
218 ; AVX-NEXT: orl %ecx, %eax
219 ; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
220 ; AVX-NEXT: retq
224 ; XOP-NEXT: vmovd %edi, %xmm0
225 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
226 ; XOP-NEXT: vmovd %xmm0, %eax
227 ; XOP-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
228 ; XOP-NEXT: retq
236 ; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
237 ; SSE-NEXT: movl %edi, %eax
238 ; SSE-NEXT: shll $31, %eax
239 ; SSE-NEXT: movl %edi, %ecx
240 ; SSE-NEXT: andl $2, %ecx
241 ; SSE-NEXT: shll $29, %ecx
242 ; SSE-NEXT: leal (%rcx,%rax), %eax
243 ; SSE-NEXT: movl %edi, %ecx
244 ; SSE-NEXT: andl $4, %ecx
245 ; SSE-NEXT: shll $27, %ecx
246 ; SSE-NEXT: orl %ecx, %eax
247 ; SSE-NEXT: movl %edi, %ecx
248 ; SSE-NEXT: andl $8, %ecx
249 ; SSE-NEXT: shll $25, %ecx
250 ; SSE-NEXT: orl %ecx, %eax
251 ; SSE-NEXT: movl %edi, %ecx
252 ; SSE-NEXT: andl $16, %ecx
253 ; SSE-NEXT: shll $23, %ecx
254 ; SSE-NEXT: orl %ecx, %eax
255 ; SSE-NEXT: movl %edi, %ecx
256 ; SSE-NEXT: andl $32, %ecx
257 ; SSE-NEXT: shll $21, %ecx
258 ; SSE-NEXT: orl %ecx, %eax
259 ; SSE-NEXT: movl %edi, %ecx
260 ; SSE-NEXT: andl $64, %ecx
261 ; SSE-NEXT: shll $19, %ecx
262 ; SSE-NEXT: movl %edi, %edx
263 ; SSE-NEXT: shll $17, %edx
264 ; SSE-NEXT: andl $16777216, %edx # imm = 0x1000000
265 ; SSE-NEXT: orl %ecx, %edx
266 ; SSE-NEXT: movl %edi, %ecx
267 ; SSE-NEXT: shll $15, %ecx
268 ; SSE-NEXT: andl $8388608, %ecx # imm = 0x800000
269 ; SSE-NEXT: orl %edx, %ecx
270 ; SSE-NEXT: movl %edi, %edx
271 ; SSE-NEXT: shll $13, %edx
272 ; SSE-NEXT: andl $4194304, %edx # imm = 0x400000
273 ; SSE-NEXT: orl %ecx, %edx
274 ; SSE-NEXT: movl %edi, %ecx
275 ; SSE-NEXT: shll $11, %ecx
276 ; SSE-NEXT: andl $2097152, %ecx # imm = 0x200000
277 ; SSE-NEXT: orl %edx, %ecx
278 ; SSE-NEXT: movl %edi, %edx
279 ; SSE-NEXT: shll $9, %edx
280 ; SSE-NEXT: andl $1048576, %edx # imm = 0x100000
281 ; SSE-NEXT: orl %ecx, %edx
282 ; SSE-NEXT: movl %edi, %ecx
283 ; SSE-NEXT: shll $7, %ecx
284 ; SSE-NEXT: andl $524288, %ecx # imm = 0x80000
285 ; SSE-NEXT: orl %edx, %ecx
286 ; SSE-NEXT: movl %edi, %edx
287 ; SSE-NEXT: shll $5, %edx
288 ; SSE-NEXT: andl $262144, %edx # imm = 0x40000
289 ; SSE-NEXT: orl %ecx, %edx
290 ; SSE-NEXT: leal (,%rdi,8), %ecx
291 ; SSE-NEXT: andl $131072, %ecx # imm = 0x20000
292 ; SSE-NEXT: orl %edx, %ecx
293 ; SSE-NEXT: leal (%rdi,%rdi), %edx
294 ; SSE-NEXT: andl $65536, %edx # imm = 0x10000
295 ; SSE-NEXT: orl %ecx, %edx
296 ; SSE-NEXT: movl %edi, %ecx
297 ; SSE-NEXT: shrl %ecx
298 ; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
299 ; SSE-NEXT: orl %edx, %ecx
300 ; SSE-NEXT: movl %edi, %edx
301 ; SSE-NEXT: shrl $3, %edx
302 ; SSE-NEXT: andl $16384, %edx # imm = 0x4000
303 ; SSE-NEXT: orl %ecx, %edx
304 ; SSE-NEXT: movl %edi, %ecx
305 ; SSE-NEXT: shrl $5, %ecx
306 ; SSE-NEXT: andl $8192, %ecx # imm = 0x2000
307 ; SSE-NEXT: orl %edx, %ecx
308 ; SSE-NEXT: movl %edi, %edx
309 ; SSE-NEXT: shrl $7, %edx
310 ; SSE-NEXT: andl $4096, %edx # imm = 0x1000
311 ; SSE-NEXT: orl %ecx, %edx
312 ; SSE-NEXT: movl %edi, %ecx
313 ; SSE-NEXT: shrl $9, %ecx
314 ; SSE-NEXT: andl $2048, %ecx # imm = 0x800
315 ; SSE-NEXT: orl %edx, %ecx
316 ; SSE-NEXT: movl %edi, %edx
317 ; SSE-NEXT: shrl $11, %edx
318 ; SSE-NEXT: andl $1024, %edx # imm = 0x400
319 ; SSE-NEXT: orl %ecx, %edx
320 ; SSE-NEXT: movl %edi, %ecx
321 ; SSE-NEXT: shrl $13, %ecx
322 ; SSE-NEXT: andl $512, %ecx # imm = 0x200
323 ; SSE-NEXT: orl %edx, %ecx
324 ; SSE-NEXT: movl %edi, %edx
325 ; SSE-NEXT: shrl $15, %edx
326 ; SSE-NEXT: andl $256, %edx # imm = 0x100
327 ; SSE-NEXT: orl %ecx, %edx
328 ; SSE-NEXT: movl %edi, %ecx
329 ; SSE-NEXT: shrl $17, %ecx
330 ; SSE-NEXT: andl $128, %ecx
331 ; SSE-NEXT: orl %edx, %ecx
332 ; SSE-NEXT: movl %edi, %edx
333 ; SSE-NEXT: shrl $19, %edx
334 ; SSE-NEXT: andl $64, %edx
335 ; SSE-NEXT: orl %ecx, %edx
336 ; SSE-NEXT: movl %edi, %ecx
337 ; SSE-NEXT: shrl $21, %ecx
338 ; SSE-NEXT: andl $32, %ecx
339 ; SSE-NEXT: orl %edx, %ecx
340 ; SSE-NEXT: movl %edi, %edx
341 ; SSE-NEXT: shrl $23, %edx
342 ; SSE-NEXT: andl $16, %edx
343 ; SSE-NEXT: orl %ecx, %edx
344 ; SSE-NEXT: movl %edi, %ecx
345 ; SSE-NEXT: shrl $25, %ecx
346 ; SSE-NEXT: andl $8, %ecx
347 ; SSE-NEXT: orl %edx, %ecx
348 ; SSE-NEXT: movl %edi, %edx
349 ; SSE-NEXT: shrl $27, %edx
350 ; SSE-NEXT: andl $4, %edx
351 ; SSE-NEXT: orl %ecx, %edx
352 ; SSE-NEXT: movl %edi, %ecx
353 ; SSE-NEXT: shrl $29, %ecx
354 ; SSE-NEXT: andl $2, %ecx
355 ; SSE-NEXT: orl %edx, %ecx
356 ; SSE-NEXT: shrl $31, %edi
357 ; SSE-NEXT: orl %ecx, %edi
358 ; SSE-NEXT: orl %edi, %eax
359 ; SSE-NEXT: retq
363 ; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
364 ; AVX-NEXT: movl %edi, %eax
365 ; AVX-NEXT: shll $31, %eax
366 ; AVX-NEXT: movl %edi, %ecx
367 ; AVX-NEXT: andl $2, %ecx
368 ; AVX-NEXT: shll $29, %ecx
369 ; AVX-NEXT: leal (%rcx,%rax), %eax
370 ; AVX-NEXT: movl %edi, %ecx
371 ; AVX-NEXT: andl $4, %ecx
372 ; AVX-NEXT: shll $27, %ecx
373 ; AVX-NEXT: orl %ecx, %eax
374 ; AVX-NEXT: movl %edi, %ecx
375 ; AVX-NEXT: andl $8, %ecx
376 ; AVX-NEXT: shll $25, %ecx
377 ; AVX-NEXT: orl %ecx, %eax
378 ; AVX-NEXT: movl %edi, %ecx
379 ; AVX-NEXT: andl $16, %ecx
380 ; AVX-NEXT: shll $23, %ecx
381 ; AVX-NEXT: orl %ecx, %eax
382 ; AVX-NEXT: movl %edi, %ecx
383 ; AVX-NEXT: andl $32, %ecx
384 ; AVX-NEXT: shll $21, %ecx
385 ; AVX-NEXT: orl %ecx, %eax
386 ; AVX-NEXT: movl %edi, %ecx
387 ; AVX-NEXT: andl $64, %ecx
388 ; AVX-NEXT: shll $19, %ecx
389 ; AVX-NEXT: movl %edi, %edx
390 ; AVX-NEXT: shll $17, %edx
391 ; AVX-NEXT: andl $16777216, %edx # imm = 0x1000000
392 ; AVX-NEXT: orl %ecx, %edx
393 ; AVX-NEXT: movl %edi, %ecx
394 ; AVX-NEXT: shll $15, %ecx
395 ; AVX-NEXT: andl $8388608, %ecx # imm = 0x800000
396 ; AVX-NEXT: orl %edx, %ecx
397 ; AVX-NEXT: movl %edi, %edx
398 ; AVX-NEXT: shll $13, %edx
399 ; AVX-NEXT: andl $4194304, %edx # imm = 0x400000
400 ; AVX-NEXT: orl %ecx, %edx
401 ; AVX-NEXT: movl %edi, %ecx
402 ; AVX-NEXT: shll $11, %ecx
403 ; AVX-NEXT: andl $2097152, %ecx # imm = 0x200000
404 ; AVX-NEXT: orl %edx, %ecx
405 ; AVX-NEXT: movl %edi, %edx
406 ; AVX-NEXT: shll $9, %edx
407 ; AVX-NEXT: andl $1048576, %edx # imm = 0x100000
408 ; AVX-NEXT: orl %ecx, %edx
409 ; AVX-NEXT: movl %edi, %ecx
410 ; AVX-NEXT: shll $7, %ecx
411 ; AVX-NEXT: andl $524288, %ecx # imm = 0x80000
412 ; AVX-NEXT: orl %edx, %ecx
413 ; AVX-NEXT: movl %edi, %edx
414 ; AVX-NEXT: shll $5, %edx
415 ; AVX-NEXT: andl $262144, %edx # imm = 0x40000
416 ; AVX-NEXT: orl %ecx, %edx
417 ; AVX-NEXT: leal (,%rdi,8), %ecx
418 ; AVX-NEXT: andl $131072, %ecx # imm = 0x20000
419 ; AVX-NEXT: orl %edx, %ecx
420 ; AVX-NEXT: leal (%rdi,%rdi), %edx
421 ; AVX-NEXT: andl $65536, %edx # imm = 0x10000
422 ; AVX-NEXT: orl %ecx, %edx
423 ; AVX-NEXT: movl %edi, %ecx
424 ; AVX-NEXT: shrl %ecx
425 ; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
426 ; AVX-NEXT: orl %edx, %ecx
427 ; AVX-NEXT: movl %edi, %edx
428 ; AVX-NEXT: shrl $3, %edx
429 ; AVX-NEXT: andl $16384, %edx # imm = 0x4000
430 ; AVX-NEXT: orl %ecx, %edx
431 ; AVX-NEXT: movl %edi, %ecx
432 ; AVX-NEXT: shrl $5, %ecx
433 ; AVX-NEXT: andl $8192, %ecx # imm = 0x2000
434 ; AVX-NEXT: orl %edx, %ecx
435 ; AVX-NEXT: movl %edi, %edx
436 ; AVX-NEXT: shrl $7, %edx
437 ; AVX-NEXT: andl $4096, %edx # imm = 0x1000
438 ; AVX-NEXT: orl %ecx, %edx
439 ; AVX-NEXT: movl %edi, %ecx
440 ; AVX-NEXT: shrl $9, %ecx
441 ; AVX-NEXT: andl $2048, %ecx # imm = 0x800
442 ; AVX-NEXT: orl %edx, %ecx
443 ; AVX-NEXT: movl %edi, %edx
444 ; AVX-NEXT: shrl $11, %edx
445 ; AVX-NEXT: andl $1024, %edx # imm = 0x400
446 ; AVX-NEXT: orl %ecx, %edx
447 ; AVX-NEXT: movl %edi, %ecx
448 ; AVX-NEXT: shrl $13, %ecx
449 ; AVX-NEXT: andl $512, %ecx # imm = 0x200
450 ; AVX-NEXT: orl %edx, %ecx
451 ; AVX-NEXT: movl %edi, %edx
452 ; AVX-NEXT: shrl $15, %edx
453 ; AVX-NEXT: andl $256, %edx # imm = 0x100
454 ; AVX-NEXT: orl %ecx, %edx
455 ; AVX-NEXT: movl %edi, %ecx
456 ; AVX-NEXT: shrl $17, %ecx
457 ; AVX-NEXT: andl $128, %ecx
458 ; AVX-NEXT: orl %edx, %ecx
459 ; AVX-NEXT: movl %edi, %edx
460 ; AVX-NEXT: shrl $19, %edx
461 ; AVX-NEXT: andl $64, %edx
462 ; AVX-NEXT: orl %ecx, %edx
463 ; AVX-NEXT: movl %edi, %ecx
464 ; AVX-NEXT: shrl $21, %ecx
465 ; AVX-NEXT: andl $32, %ecx
466 ; AVX-NEXT: orl %edx, %ecx
467 ; AVX-NEXT: movl %edi, %edx
468 ; AVX-NEXT: shrl $23, %edx
469 ; AVX-NEXT: andl $16, %edx
470 ; AVX-NEXT: orl %ecx, %edx
471 ; AVX-NEXT: movl %edi, %ecx
472 ; AVX-NEXT: shrl $25, %ecx
473 ; AVX-NEXT: andl $8, %ecx
474 ; AVX-NEXT: orl %edx, %ecx
475 ; AVX-NEXT: movl %edi, %edx
476 ; AVX-NEXT: shrl $27, %edx
477 ; AVX-NEXT: andl $4, %edx
478 ; AVX-NEXT: orl %ecx, %edx
479 ; AVX-NEXT: movl %edi, %ecx
480 ; AVX-NEXT: shrl $29, %ecx
481 ; AVX-NEXT: andl $2, %ecx
482 ; AVX-NEXT: orl %edx, %ecx
483 ; AVX-NEXT: shrl $31, %edi
484 ; AVX-NEXT: orl %ecx, %edi
485 ; AVX-NEXT: orl %edi, %eax
486 ; AVX-NEXT: retq
490 ; XOP-NEXT: vmovd %edi, %xmm0
491 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
492 ; XOP-NEXT: vmovd %xmm0, %eax
493 ; XOP-NEXT
501 ; SSE-NEXT: leaq (%rdi,%rdi), %rax
502 ; SSE-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
503 ; SSE-NEXT: andq %rax, %rcx
504 ; SSE-NEXT: movq %rdi, %rax
505 ; SSE-NEXT: shlq $63, %rax
506 ; SSE-NEXT: movq %rdi, %rdx
507 ; SSE-NEXT: andq $2, %rdx
508 ; SSE-NEXT: shlq $61, %rdx
509 ; SSE-NEXT: leaq (%rdx,%rax), %rax
510 ; SSE-NEXT: movq %rdi, %rdx
511 ; SSE-NEXT: andq $4, %rdx
512 ; SSE-NEXT: shlq $59, %rdx
513 ; SSE-NEXT: orq %rdx, %rax
514 ; SSE-NEXT: movq %rdi, %rdx
515 ; SSE-NEXT: andq $8, %rdx
516 ; SSE-NEXT: shlq $57, %rdx
517 ; SSE-NEXT: orq %rdx, %rax
518 ; SSE-NEXT: movq %rdi, %rdx
519 ; SSE-NEXT: andq $16, %rdx
520 ; SSE-NEXT: shlq $55, %rdx
521 ; SSE-NEXT: orq %rdx, %rax
522 ; SSE-NEXT: movq %rdi, %rdx
523 ; SSE-NEXT: andq $32, %rdx
524 ; SSE-NEXT: shlq $53, %rdx
525 ; SSE-NEXT: orq %rdx, %rax
526 ; SSE-NEXT: movq %rdi, %rdx
527 ; SSE-NEXT: andq $64, %rdx
528 ; SSE-NEXT: shlq $51, %rdx
529 ; SSE-NEXT: movq %rdi, %rsi
530 ; SSE-NEXT: andq $128, %rsi
531 ; SSE-NEXT: shlq $49, %rsi
532 ; SSE-NEXT: orq %rdx, %rsi
533 ; SSE-NEXT: movq %rdi, %rdx
534 ; SSE-NEXT: andq $256, %rdx # imm = 0x100
535 ; SSE-NEXT: shlq $47, %rdx
536 ; SSE-NEXT: orq %rsi, %rdx
537 ; SSE-NEXT: movq %rdi, %rsi
538 ; SSE-NEXT: andq $512, %rsi # imm = 0x200
539 ; SSE-NEXT: shlq $45, %rsi
540 ; SSE-NEXT: orq %rdx, %rsi
541 ; SSE-NEXT: movq %rdi, %rdx
542 ; SSE-NEXT: andq $1024, %rdx # imm = 0x400
543 ; SSE-NEXT: shlq $43, %rdx
544 ; SSE-NEXT: orq %rsi, %rdx
545 ; SSE-NEXT: movq %rdi, %rsi
546 ; SSE-NEXT: andq $2048, %rsi # imm = 0x800
547 ; SSE-NEXT: shlq $41, %rsi
548 ; SSE-NEXT: orq %rdx, %rsi
549 ; SSE-NEXT: movq %rdi, %rdx
550 ; SSE-NEXT: andq $4096, %rdx # imm = 0x1000
551 ; SSE-NEXT: shlq $39, %rdx
552 ; SSE-NEXT: orq %rsi, %rdx
553 ; SSE-NEXT: movq %rdi, %rsi
554 ; SSE-NEXT: andq $8192, %rsi # imm = 0x2000
555 ; SSE-NEXT: shlq $37, %rsi
556 ; SSE-NEXT: orq %rdx, %rsi
557 ; SSE-NEXT: movq %rdi, %rdx
558 ; SSE-NEXT: andq $16384, %rdx # imm = 0x4000
559 ; SSE-NEXT: shlq $35, %rdx
560 ; SSE-NEXT: orq %rsi, %rdx
561 ; SSE-NEXT: movq %rdi, %rsi
562 ; SSE-NEXT: andq $32768, %rsi # imm = 0x8000
563 ; SSE-NEXT: shlq $33, %rsi
564 ; SSE-NEXT: orq %rdx, %rsi
565 ; SSE-NEXT: movq %rdi, %rdx
566 ; SSE-NEXT: andq $65536, %rdx # imm = 0x10000
567 ; SSE-NEXT: shlq $31, %rdx
568 ; SSE-NEXT: orq %rsi, %rdx
569 ; SSE-NEXT: movq %rdi, %rsi
570 ; SSE-NEXT: andq $131072, %rsi # imm = 0x20000
571 ; SSE-NEXT: shlq $29, %rsi
572 ; SSE-NEXT: orq %rdx, %rsi
573 ; SSE-NEXT: movq %rdi, %rdx
574 ; SSE-NEXT: andq $262144, %rdx # imm = 0x40000
575 ; SSE-NEXT: shlq $27, %rdx
576 ; SSE-NEXT: orq %rsi, %rdx
577 ; SSE-NEXT: movq %rdi, %rsi
578 ; SSE-NEXT: andq $524288, %rsi # imm = 0x80000
579 ; SSE-NEXT: shlq $25, %rsi
580 ; SSE-NEXT: orq %rdx, %rsi
581 ; SSE-NEXT: movq %rdi, %rdx
582 ; SSE-NEXT: andq $1048576, %rdx # imm = 0x100000
583 ; SSE-NEXT: shlq $23, %rdx
584 ; SSE-NEXT: orq %rsi, %rdx
585 ; SSE-NEXT: movq %rdi, %rsi
586 ; SSE-NEXT: andq $2097152, %rsi # imm = 0x200000
587 ; SSE-NEXT: shlq $21, %rsi
588 ; SSE-NEXT: orq %rdx, %rsi
589 ; SSE-NEXT: movq %rdi, %rdx
590 ; SSE-NEXT: andq $4194304, %rdx # imm = 0x400000
591 ; SSE-NEXT: shlq $19, %rdx
592 ; SSE-NEXT: orq %rsi, %rdx
593 ; SSE-NEXT: movq %rdi, %rsi
594 ; SSE-NEXT: andq $8388608, %rsi # imm = 0x800000
595 ; SSE-NEXT: shlq $17, %rsi
596 ; SSE-NEXT: orq %rdx, %rsi
597 ; SSE-NEXT: movq %rdi, %rdx
598 ; SSE-NEXT: andq $16777216, %rdx # imm = 0x1000000
599 ; SSE-NEXT: shlq $15, %rdx
600 ; SSE-NEXT: orq %rsi, %rdx
601 ; SSE-NEXT: movq %rdi, %rsi
602 ; SSE-NEXT: andq $33554432, %rsi # imm = 0x2000000
603 ; SSE-NEXT: shlq $13, %rsi
604 ; SSE-NEXT: orq %rdx, %rsi
605 ; SSE-NEXT: movq %rdi, %rdx
606 ; SSE-NEXT: andq $67108864, %rdx # imm = 0x4000000
607 ; SSE-NEXT: shlq $11, %rdx
608 ; SSE-NEXT: orq %rsi, %rdx
609 ; SSE-NEXT: movq %rdi, %rsi
610 ; SSE-NEXT: andq $134217728, %rsi # imm = 0x8000000
611 ; SSE-NEXT: shlq $9, %rsi
612 ; SSE-NEXT: orq %rdx, %rsi
613 ; SSE-NEXT: movq %rdi, %rdx
614 ; SSE-NEXT: andq $268435456, %rdx # imm = 0x10000000
615 ; SSE-NEXT: shlq $7, %rdx
616 ; SSE-NEXT: orq %rsi, %rdx
617 ; SSE-NEXT: movq %rdi, %rsi
618 ; SSE-NEXT: andq $536870912, %rsi # imm = 0x20000000
619 ; SSE-NEXT: shlq $5, %rsi
620 ; SSE-NEXT: orq %rdx, %rsi
621 ; SSE-NEXT: movq %rdi, %rdx
622 ; SSE-NEXT: andq $1073741824, %rdx # imm = 0x40000000
623 ; SSE-NEXT: shlq $3, %rdx
624 ; SSE-NEXT: orq %rsi, %rdx
625 ; SSE-NEXT: orq %rcx, %rdx
626 ; SSE-NEXT: movq %rdi, %rcx
627 ; SSE-NEXT: shrq %rcx
628 ; SSE-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
629 ; SSE-NEXT: orq %rdx, %rcx
630 ; SSE-NEXT: movq %rdi, %rdx
631 ; SSE-NEXT: shrq $3, %rdx
632 ; SSE-NEXT: andl $1073741824, %edx # imm = 0x40000000
633 ; SSE-NEXT: orq %rcx, %rdx
634 ; SSE-NEXT: movq %rdi, %rcx
635 ; SSE-NEXT: shrq $5, %rcx
636 ; SSE-NEXT: andl $536870912, %ecx # imm = 0x20000000
637 ; SSE-NEXT: orq %rdx, %rcx
638 ; SSE-NEXT: movq %rdi, %rdx
639 ; SSE-NEXT: shrq $7, %rdx
640 ; SSE-NEXT: andl $268435456, %edx # imm = 0x10000000
641 ; SSE-NEXT: orq %rcx, %rdx
642 ; SSE-NEXT: movq %rdi, %rcx
643 ; SSE-NEXT: shrq $9, %rcx
644 ; SSE-NEXT: andl $134217728, %ecx # imm = 0x8000000
645 ; SSE-NEXT: orq %rdx, %rcx
646 ; SSE-NEXT: movq %rdi, %rdx
647 ; SSE-NEXT: shrq $11, %rdx
648 ; SSE-NEXT: andl $67108864, %edx # imm = 0x4000000
649 ; SSE-NEXT: orq %rcx, %rdx
650 ; SSE-NEXT: movq %rdi, %rcx
651 ; SSE-NEXT: shrq $13, %rcx
652 ; SSE-NEXT: andl $33554432, %ecx # imm = 0x2000000
653 ; SSE-NEXT: orq %rdx, %rcx
654 ; SSE-NEXT: movq %rdi, %rdx
655 ; SSE-NEXT: shrq $15, %rdx
656 ; SSE-NEXT: andl $16777216, %edx # imm = 0x1000000
657 ; SSE-NEXT: orq %rcx, %rdx
658 ; SSE-NEXT: movq %rdi, %rcx
659 ; SSE-NEXT: shrq $17, %rcx
660 ; SSE-NEXT: andl $8388608, %ecx # imm = 0x800000
661 ; SSE-NEXT: orq %rdx, %rcx
662 ; SSE-NEXT: movq %rdi, %rdx
663 ; SSE-NEXT: shrq $19, %rdx
664 ; SSE-NEXT: andl $4194304, %edx # imm = 0x400000
665 ; SSE-NEXT: orq %rcx, %rdx
666 ; SSE-NEXT: movq %rdi, %rcx
667 ; SSE-NEXT: shrq $21, %rcx
668 ; SSE-NEXT: andl $2097152, %ecx # imm = 0x200000
669 ; SSE-NEXT: orq %rdx, %rcx
670 ; SSE-NEXT: movq %rdi, %rdx
671 ; SSE-NEXT: shrq $23, %rdx
672 ; SSE-NEXT: andl $1048576, %edx # imm = 0x100000
673 ; SSE-NEXT: orq %rcx, %rdx
674 ; SSE-NEXT: movq %rdi, %rcx
675 ; SSE-NEXT: shrq $25, %rcx
676 ; SSE-NEXT: andl $524288, %ecx # imm = 0x80000
677 ; SSE-NEXT: orq %rdx, %rcx
678 ; SSE-NEXT: movq %rdi, %rdx
679 ; SSE-NEXT: shrq $27, %rdx
680 ; SSE-NEXT: andl $262144, %edx # imm = 0x40000
681 ; SSE-NEXT: orq %rcx, %rdx
682 ; SSE-NEXT: movq %rdi, %rcx
683 ; SSE-NEXT: shrq $29, %rcx
684 ; SSE-NEXT: andl $131072, %ecx # imm = 0x20000
685 ; SSE-NEXT: orq %rdx, %rcx
686 ; SSE-NEXT: movq %rdi, %rdx
687 ; SSE-NEXT: shrq $31, %rdx
688 ; SSE-NEXT: andl $65536, %edx # imm = 0x10000
689 ; SSE-NEXT: orq %rcx, %rdx
690 ; SSE-NEXT: movq %rdi, %rcx
691 ; SSE-NEXT: shrq $33, %rcx
692 ; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
693 ; SSE-NEXT: orq %rdx, %rcx
694 ; SSE-NEXT: movq %rdi, %rdx
695 ; SSE-NEXT: shrq $35, %rdx
696 ; SSE-NEXT: andl $16384, %edx # imm = 0x4000
697 ; SSE-NEXT: orq %rcx, %rdx
698 ; SSE-NEXT: movq %rdi, %rcx
699 ; SSE-NEXT: shrq $37, %rcx
700 ; SSE-NEXT: andl $8192, %ecx # imm = 0x2000
701 ; SSE-NEXT: orq %rdx, %rcx
702 ; SSE-NEXT: movq %rdi, %rdx
703 ; SSE-NEXT: shrq $39, %rdx
704 ; SSE-NEXT: andl $4096, %edx # imm = 0x1000
705 ; SSE-NEXT: orq %rcx, %rdx
706 ; SSE-NEXT: movq %rdi, %rcx
707 ; SSE-NEXT: shrq $41, %rcx
708 ; SSE-NEXT: andl $2048, %ecx # imm = 0x800
709 ; SSE-NEXT: orq %rdx, %rcx
710 ; SSE-NEXT: movq %rdi, %rdx
711 ; SSE-NEXT: shrq $43, %rdx
712 ; SSE-NEXT: andl $1024, %edx # imm = 0x400
713 ; SSE-NEXT: orq %rcx, %rdx
714 ; SSE-NEXT: movq %rdi, %rcx
715 ; SSE-NEXT: shrq $45, %rcx
716 ; SSE-NEXT: andl $512, %ecx # imm = 0x200
717 ; SSE-NEXT: orq %rdx, %rcx
718 ; SSE-NEXT: movq %rdi, %rdx
719 ; SSE-NEXT: shrq $47, %rdx
720 ; SSE-NEXT: andl $256, %edx # imm = 0x100
721 ; SSE-NEXT: orq %rcx, %rdx
722 ; SSE-NEXT: movq %rdi, %rcx
723 ; SSE-NEXT: shrq $49, %rcx
724 ; SSE-NEXT: andl $128, %ecx
725 ; SSE-NEXT: orq %rdx, %rcx
726 ; SSE-NEXT: movq %rdi, %rdx
727 ; SSE-NEXT: shrq $51, %rdx
728 ; SSE-NEXT: andl $64, %edx
729 ; SSE-NEXT: orq %rcx, %rdx
730 ; SSE-NEXT: movq %rdi, %rcx
731 ; SSE-NEXT: shrq $53, %rcx
732 ; SSE-NEXT: andl $32, %ecx
733 ; SSE-NEXT: orq %rdx, %rcx
734 ; SSE-NEXT: movq %rdi, %rdx
735 ; SSE-NEXT: shrq $55, %rdx
736 ; SSE-NEXT: andl $16, %edx
737 ; SSE-NEXT: orq %rcx, %rdx
738 ; SSE-NEXT: movq %rdi, %rcx
739 ; SSE-NEXT: shrq $57, %rcx
740 ; SSE-NEXT: andl $8, %ecx
741 ; SSE-NEXT: orq %rdx, %rcx
742 ; SSE-NEXT: movq %rdi, %rdx
743 ; SSE-NEXT: shrq $59, %rdx
744 ; SSE-NEXT: andl $4, %edx
745 ; SSE-NEXT: orq %rcx, %rdx
746 ; SSE-NEXT: movq %rdi, %rcx
747 ; SSE-NEXT: shrq $61, %rcx
748 ; SSE-NEXT: andl $2, %ecx
749 ; SSE-NEXT: orq %rdx, %rcx
750 ; SSE-NEXT: shrq $63, %rdi
751 ; SSE-NEXT: orq %rcx, %rdi
752 ; SSE-NEXT: orq %rdi, %rax
753 ; SSE-NEXT: retq
757 ; AVX-NEXT: leaq (%rdi,%rdi), %rax
758 ; AVX-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
759 ; AVX-NEXT: andq %rax, %rcx
760 ; AVX-NEXT: movq %rdi, %rax
761 ; AVX-NEXT: shlq $63, %rax
762 ; AVX-NEXT: movq %rdi, %rdx
763 ; AVX-NEXT: andq $2, %rdx
764 ; AVX-NEXT: shlq $61, %rdx
765 ; AVX-NEXT: leaq (%rdx,%rax), %rax
766 ; AVX-NEXT: movq %rdi, %rdx
767 ; AVX-NEXT: andq $4, %rdx
768 ; AVX-NEXT: shlq $59, %rdx
769 ; AVX-NEXT: orq %rdx, %rax
770 ; AVX-NEXT: movq %rdi, %rdx
771 ; AVX-NEXT: andq $8, %rdx
772 ; AVX-NEXT: shlq $57, %rdx
773 ; AVX-NEXT: orq %rdx, %rax
774 ; AVX-NEXT: movq %rdi, %rdx
775 ; AVX-NEXT: andq $16, %rdx
776 ; AVX-NEXT: shlq $55, %rdx
777 ; AVX-NEXT: orq %rdx, %rax
778 ; AVX-NEXT: movq %rdi, %rdx
779 ; AVX-NEXT: andq $32, %rdx
780 ; AVX-NEXT: shlq $53, %rdx
781 ; AVX-NEXT: orq %rdx, %rax
782 ; AVX-NEXT: movq %rdi, %rdx
783 ; AVX-NEXT: andq $64, %rdx
784 ; AVX-NEXT: shlq $51, %rdx
785 ; AVX-NEXT: movq %rdi, %rsi
786 ; AVX-NEXT: andq $128, %rsi
787 ; AVX-NEXT: shlq $49, %rsi
788 ; AVX-NEXT: orq %rdx, %rsi
789 ; AVX-NEXT: movq %rdi, %rdx
790 ; AVX-NEXT: andq $256, %rdx # imm = 0x100
791 ; AVX-NEXT: shlq $47, %rdx
792 ; AVX-NEXT: orq %rsi, %rdx
793 ; AVX-NEXT: movq %rdi, %rsi
794 ; AVX-NEXT: andq $512, %rsi # imm = 0x200
795 ; AVX-NEXT: shlq $45, %rsi
796 ; AVX-NEXT: orq %rdx, %rsi
797 ; AVX-NEXT: movq %rdi, %rdx
798 ; AVX-NEXT: andq $1024, %rdx # imm = 0x400
799 ; AVX-NEXT: shlq $43, %rdx
800 ; AVX-NEXT: orq %rsi, %rdx
801 ; AVX-NEXT: movq %rdi, %rsi
802 ; AVX-NEXT: andq $2048, %rsi # imm = 0x800
803 ; AVX-NEXT: shlq $41, %rsi
804 ; AVX-NEXT: orq %rdx, %rsi
805 ; AVX-NEXT: movq %rdi, %rdx
806 ; AVX-NEXT: andq $4096, %rdx # imm = 0x1000
807 ; AVX-NEXT: shlq $39, %rdx
808 ; AVX-NEXT: orq %rsi, %rdx
809 ; AVX-NEXT: movq %rdi, %rsi
810 ; AVX-NEXT: andq $8192, %rsi # imm = 0x2000
811 ; AVX-NEXT: shlq $37, %rsi
812 ; AVX-NEXT: orq %rdx, %rsi
813 ; AVX-NEXT: movq %rdi, %rdx
814 ; AVX-NEXT: andq $16384, %rdx # imm = 0x4000
815 ; AVX-NEXT: shlq $35, %rdx
816 ; AVX-NEXT: orq %rsi, %rdx
817 ; AVX-NEXT: movq %rdi, %rsi
818 ; AVX-NEXT: andq $32768, %rsi # imm = 0x8000
819 ; AVX-NEXT: shlq $33, %rsi
820 ; AVX-NEXT: orq %rdx, %rsi
821 ; AVX-NEXT: movq %rdi, %rdx
822 ; AVX-NEXT: andq $65536, %rdx # imm = 0x10000
823 ; AVX-NEXT: shlq $31, %rdx
824 ; AVX-NEXT: orq %rsi, %rdx
825 ; AVX-NEXT: movq %rdi, %rsi
826 ; AVX-NEXT: andq $131072, %rsi # imm = 0x20000
827 ; AVX-NEXT: shlq $29, %rsi
828 ; AVX-NEXT: orq %rdx, %rsi
829 ; AVX-NEXT: movq %rdi, %rdx
830 ; AVX-NEXT: andq $262144, %rdx # imm = 0x40000
831 ; AVX-NEXT: shlq $27, %rdx
832 ; AVX-NEXT: orq %rsi, %rdx
833 ; AVX-NEXT: movq %rdi, %rsi
834 ; AVX-NEXT: andq $524288, %rsi # imm = 0x80000
835 ; AVX-NEXT: shlq $25, %rsi
836 ; AVX-NEXT: orq %rdx, %rsi
837 ; AVX-NEXT: movq %rdi, %rdx
838 ; AVX-NEXT: andq $1048576, %rdx # imm = 0x100000
839 ; AVX-NEXT: shlq $23, %rdx
840 ; AVX-NEXT: orq %rsi, %rdx
841 ; AVX-NEXT: movq %rdi, %rsi
842 ; AVX-NEXT: andq $2097152, %rsi # imm = 0x200000
843 ; AVX-NEXT: shlq $21, %rsi
844 ; AVX-NEXT: orq %rdx, %rsi
845 ; AVX-NEXT: movq %rdi, %rdx
846 ; AVX-NEXT: andq $4194304, %rdx # imm = 0x400000
847 ; AVX-NEXT: shlq $19, %rdx
848 ; AVX-NEXT: orq %rsi, %rdx
849 ; AVX-NEXT: movq %rdi, %rsi
850 ; AVX-NEXT: andq $8388608, %rsi # imm = 0x800000
851 ; AVX-NEXT: shlq $17, %rsi
852 ; AVX-NEXT: orq %rdx, %rsi
853 ; AVX-NEXT: movq %rdi, %rdx
854 ; AVX-NEXT: andq $16777216, %rdx # imm = 0x1000000
855 ; AVX-NEXT: shlq $15, %rdx
856 ; AVX-NEXT: orq %rsi, %rdx
857 ; AVX-NEXT: movq %rdi, %rsi
858 ; AVX-NEXT: andq $33554432, %rsi # imm = 0x2000000
859 ; AVX-NEXT: shlq $13, %rsi
860 ; AVX-NEXT: orq %rdx, %rsi
861 ; AVX-NEXT: movq %rdi, %rdx
862 ; AVX-NEXT: andq $67108864, %rdx # imm = 0x4000000
863 ; AVX-NEXT: shlq $11, %rdx
864 ; AVX-NEXT: orq %rsi, %rdx
865 ; AVX-NEXT: movq %rdi, %rsi
866 ; AVX-NEXT: andq $134217728, %rsi # imm = 0x8000000
867 ; AVX-NEXT: shlq $9, %rsi
868 ; AVX-NEXT: orq %rdx, %rsi
869 ; AVX-NEXT: movq %rdi, %rdx
870 ; AVX-NEXT: andq $268435456, %rdx # imm = 0x10000000
871 ; AVX-NEXT: shlq $7, %rdx
872 ; AVX-NEXT: orq %rsi, %rdx
873 ; AVX-NEXT: movq %rdi, %rsi
874 ; AVX-NEXT: andq $536870912, %rsi # imm = 0x20000000
875 ; AVX-NEXT: shlq $5, %rsi
876 ; AVX-NEXT: orq %rdx, %rsi
877 ; AVX-NEXT: movq %rdi, %rdx
878 ; AVX-NEXT: andq $1073741824, %rdx # imm = 0x40000000
879 ; AVX-NEXT: shlq $3, %rdx
880 ; AVX-NEXT: orq %rsi, %rdx
881 ; AVX-NEXT: orq %rcx, %rdx
882 ; AVX-NEXT: movq %rdi, %rcx
883 ; AVX-NEXT: shrq %rcx
884 ; AVX-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
885 ; AVX-NEXT: orq %rdx, %rcx
886 ; AVX-NEXT: movq %rdi, %rdx
887 ; AVX-NEXT: shrq $3, %rdx
888 ; AVX-NEXT: andl $1073741824, %edx # imm = 0x40000000
889 ; AVX-NEXT: orq %rcx, %rdx
890 ; AVX-NEXT: movq %rdi, %rcx
891 ; AVX-NEXT: shrq $5, %rcx
892 ; AVX-NEXT: andl $536870912, %ecx # imm = 0x20000000
893 ; AVX-NEXT: orq %rdx, %rcx
894 ; AVX-NEXT: movq %rdi, %rdx
895 ; AVX-NEXT: shrq $7, %rdx
896 ; AVX-NEXT: andl $268435456, %edx # imm = 0x10000000
897 ; AVX-NEXT: orq %rcx, %rdx
898 ; AVX-NEXT: movq %rdi, %rcx
899 ; AVX-NEXT: shrq $9, %rcx
900 ; AVX-NEXT: andl $134217728, %ecx # imm = 0x8000000
901 ; AVX-NEXT: orq %rdx, %rcx
902 ; AVX-NEXT: movq %rdi, %rdx
903 ; AVX-NEXT: shrq $11, %rdx
904 ; AVX-NEXT: andl $67108864, %edx # imm = 0x4000000
905 ; AVX-NEXT: orq %rcx, %rdx
906 ; AVX-NEXT: movq %rdi, %rcx
907 ; AVX-NEXT: shrq $13, %rcx
908 ; AVX-NEXT: andl $33554432, %ecx # imm = 0x2000000
909 ; AVX-NEXT: orq %rdx, %rcx
910 ; AVX-NEXT: movq %rdi, %rdx
911 ; AVX-NEXT: shrq $15, %rdx
912 ; AVX-NEXT: andl $16777216, %edx # imm = 0x1000000
913 ; AVX-NEXT: orq %rcx, %rdx
914 ; AVX-NEXT: movq %rdi, %rcx
915 ; AVX-NEXT: shrq $17, %rcx
916 ; AVX-NEXT: andl $8388608, %ecx # imm = 0x800000
917 ; AVX-NEXT: orq %rdx, %rcx
918 ; AVX-NEXT: movq %rdi, %rdx
919 ; AVX-NEXT: shrq $19, %rdx
920 ; AVX-NEXT: andl $4194304, %edx # imm = 0x400000
921 ; AVX-NEXT: orq %rcx, %rdx
922 ; AVX-NEXT: movq %rdi, %rcx
923 ; AVX-NEXT: shrq $21, %rcx
924 ; AVX-NEXT: andl $2097152, %ecx # imm = 0x200000
925 ; AVX-NEXT: orq %rdx, %rcx
926 ; AVX-NEXT: movq %rdi, %rdx
927 ; AVX-NEXT: shrq $23, %rdx
928 ; AVX-NEXT: andl $1048576, %edx # imm = 0x100000
929 ; AVX-NEXT: orq %rcx, %rdx
930 ; AVX-NEXT: movq %rdi, %rcx
931 ; AVX-NEXT: shrq $25, %rcx
932 ; AVX-NEXT: andl $524288, %ecx # imm = 0x80000
933 ; AVX-NEXT: orq %rdx, %rcx
934 ; AVX-NEXT: movq %rdi, %rdx
935 ; AVX-NEXT: shrq $27, %rdx
936 ; AVX-NEXT: andl $262144, %edx # imm = 0x40000
937 ; AVX-NEXT: orq %rcx, %rdx
938 ; AVX-NEXT: movq %rdi, %rcx
939 ; AVX-NEXT: shrq $29, %rcx
940 ; AVX-NEXT: andl $131072, %ecx # imm = 0x20000
941 ; AVX-NEXT: orq %rdx, %rcx
942 ; AVX-NEXT: movq %rdi, %rdx
943 ; AVX-NEXT: shrq $31, %rdx
944 ; AVX-NEXT: andl $65536, %edx # imm = 0x10000
945 ; AVX-NEXT: orq %rcx, %rdx
946 ; AVX-NEXT: movq %rdi, %rcx
947 ; AVX-NEXT: shrq $33, %rcx
948 ; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
949 ; AVX-NEXT: orq %rdx, %rcx
950 ; AVX-NEXT: movq %rdi, %rdx
951 ; AVX-NEXT: shrq $35, %rdx
952 ; AVX-NEXT: andl $16384, %edx # imm = 0x4000
953 ; AVX-NEXT: orq %rcx, %rdx
954 ; AVX-NEXT: movq %rdi, %rcx
955 ; AVX-NEXT: shrq $37, %rcx
956 ; AVX-NEXT: andl $8192, %ecx # imm = 0x2000
957 ; AVX-NEXT: orq %rdx, %rcx
958 ; AVX-NEXT: movq %rdi, %rdx
959 ; AVX-NEXT: shrq $39, %rdx
960 ; AVX-NEXT: andl $4096, %edx # imm = 0x1000
961 ; AVX-NEXT: orq %rcx, %rdx
962 ; AVX-NEXT: movq %rdi, %rcx
963 ; AVX-NEXT: shrq $41, %rcx
964 ; AVX-NEXT: andl $2048, %ecx # imm = 0x800
965 ; AVX-NEXT: orq %rdx, %rcx
966 ; AVX-NEXT: movq %rdi, %rdx
967 ; AVX-NEXT: shrq $43, %rdx
968 ; AVX-NEXT: andl $1024, %edx # imm = 0x400
969 ; AVX-NEXT: orq %rcx, %rdx
970 ; AVX-NEXT: movq %rdi, %rcx
971 ; AVX-NEXT: shrq $45, %rcx
972 ; AVX-NEXT
973 ; AVX-NEXT: orq %rdx, %rcx
974 ; AVX-NEXT: movq %rdi, %rdx
975 ; AVX-NEXT: shrq $47, %rdx
976 ; AVX-NEXT: andl $256, %edx # imm = 0x100
977 ; AVX-NEXT: orq %rcx, %rdx
978 ; AVX-NEXT: movq %rdi, %rcx
979 ; AVX-NEXT: shrq $49, %rcx
980 ; AVX-NEXT: andl $128, %ecx
981 ; AVX-NEXT: orq %rdx, %rcx
982 ; AVX-NEXT: movq %rdi, %rdx
983 ; AVX-NEXT: shrq $51, %rdx
984 ; AVX-NEXT: andl $64, %edx
985 ; AVX-NEXT: orq %rcx, %rdx
986 ; AVX-NEXT: movq %rdi, %rcx
987 ; AVX-NEXT: shrq $53, %rcx
988 ; AVX-NEXT: andl $32, %ecx
989 ; AVX-NEXT: orq %rdx, %rcx
990 ; AVX-NEXT: movq %rdi, %rdx
991 ; AVX-NEXT: shrq $55, %rdx
992 ; AVX-NEXT: andl $16, %edx
993 ; AVX-NEXT: orq %rcx, %rdx
994 ; AVX-NEXT: movq %rdi, %rcx
995 ; AVX-NEXT: shrq $57, %rcx
996 ; AVX-NEXT: andl $8, %ecx
997 ; AVX-NEXT: orq %rdx, %rcx
998 ; AVX-NEXT: movq %rdi, %rdx
999 ; AVX-NEXT: shrq $59, %rdx
1000 ; AVX-NEXT: andl $4, %edx
1001 ; AVX-NEXT: orq %rcx, %rdx
1002 ; AVX-NEXT: movq %rdi, %rcx
1003 ; AVX-NEXT: shrq $61, %rcx
1004 ; AVX-NEXT: andl $2, %ecx
1005 ; AVX-NEXT: orq %rdx, %rcx
1006 ; AVX-NEXT: shrq $63, %rdi
1007 ; AVX-NEXT: orq %rcx, %rdi
1008 ; AVX-NEXT: orq %rdi, %rax
1009 ; AVX-NEXT: retq
1013 ; XOP-NEXT: vmovq %rdi, %xmm0
1014 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
1015 ; XOP-NEXT: vmovq %xmm0, %rax
1016 ; XOP-NEXT: retq
1024 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1025 ; SSE2-NEXT: psrlw $7, %xmm2
1026 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1027 ; SSE2-NEXT: pand %xmm1, %xmm1
1028 ; SSE2-NEXT: pand %xmm2, %xmm1
1029 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1030 ; SSE2-NEXT: psllw $7, %xmm2
1031 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1032 ; SSE2-NEXT: pand %xmm3, %xmm3
1033 ; SSE2-NEXT: pand %xmm3, %xmm2
1034 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1035 ; SSE2-NEXT: psllw $5, %xmm3
1036 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1037 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1038 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1039 ; SSE2-NEXT: psllw $3, %xmm4
1040 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
1041 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
1042 ; SSE2-NEXT: por %xmm3, %xmm4
1043 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1044 ; SSE2-NEXT: paddb %xmm3, %xmm3
1045 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1046 ; SSE2-NEXT: por %xmm4, %xmm3
1047 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1048 ; SSE2-NEXT: psrlw $1, %xmm4
1049 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
1050 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
1051 ; SSE2-NEXT: por %xmm3, %xmm4
1052 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1053 ; SSE2-NEXT: psrlw $3, %xmm3
1054 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1055 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1056 ; SSE2-NEXT: por %xmm4, %xmm3
1057 ; SSE2-NEXT: psrlw $5, %xmm0
1058 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1059 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1060 ; SSE2-NEXT: por %xmm3, %xmm0
1061 ; SSE2-NEXT: por %xmm1, %xmm0
1062 ; SSE2-NEXT: por %xmm2, %xmm0
1063 ; SSE2-NEXT: retq
1067 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1068 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1069 ; SSSE3-NEXT: pand %xmm1, %xmm2
1070 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1071 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
1072 ; SSSE3-NEXT: psrlw $4, %xmm0
1073 ; SSSE3-NEXT: pand %xmm1, %xmm0
1074 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1075 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
1076 ; SSSE3-NEXT: por %xmm3, %xmm1
1077 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1078 ; SSSE3-NEXT: retq
1082 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1083 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
1084 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1085 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1086 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
1087 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
1088 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1089 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
1090 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
1091 ; AVX-NEXT: retq
1095 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
1096 ; XOP-NEXT: retq
1104 ; SSE2-NEXT: pxor %xmm1, %xmm1
1105 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1106 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1107 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
1108 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
1109 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1110 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
1111 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6]
1112 ; SSE2-NEXT: packuswb %xmm2, %xmm1
1113 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1114 ; SSE2-NEXT: psllw $7, %xmm0
1115 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1116 ; SSE2-NEXT: pand %xmm2, %xmm2
1117 ; SSE2-NEXT: pand %xmm0, %xmm2
1118 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1119 ; SSE2-NEXT: psllw $5, %xmm0
1120 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1121 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1122 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1123 ; SSE2-NEXT: psllw $3, %xmm3
1124 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1125 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1126 ; SSE2-NEXT: por %xmm0, %xmm3
1127 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1128 ; SSE2-NEXT: paddb %xmm0, %xmm0
1129 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1130 ; SSE2-NEXT: por %xmm3, %xmm0
1131 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1132 ; SSE2-NEXT: psrlw $1, %xmm3
1133 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1134 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1135 ; SSE2-NEXT: por %xmm0, %xmm3
1136 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1137 ; SSE2-NEXT: psrlw $3, %xmm0
1138 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1139 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1140 ; SSE2-NEXT: por %xmm3, %xmm0
1141 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1142 ; SSE2-NEXT: psrlw $5, %xmm3
1143 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1144 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1145 ; SSE2-NEXT: por %xmm0, %xmm3
1146 ; SSE2-NEXT: psrlw $7, %xmm1
1147 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1148 ; SSE2-NEXT: pand %xmm0, %xmm0
1149 ; SSE2-NEXT: pand %xmm1, %xmm0
1150 ; SSE2-NEXT: por %xmm3, %xmm0
1151 ; SSE2-NEXT: por %xmm2, %xmm0
1152 ; SSE2-NEXT: retq
1156 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1157 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1158 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1159 ; SSSE3-NEXT: pand %xmm1, %xmm2
1160 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1161 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
1162 ; SSSE3-NEXT: psrlw $4, %xmm0
1163 ; SSSE3-NEXT: pand %xmm1, %xmm0
1164 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1165 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
1166 ; SSSE3-NEXT: por %xmm3, %xmm1
1167 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1168 ; SSSE3-NEXT: retq
1172 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1173 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1174 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
1175 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1176 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1177 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
1178 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
1179 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1180 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
1181 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
1182 ; AVX-NEXT: retq
1186 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
1187 ; XOP-NEXT: retq
1195 ; SSE2-NEXT: pxor %xmm1, %xmm1
1196 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1197 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1198 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1199 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1200 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1201 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1202 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
1203 ; SSE2-NEXT: packuswb %xmm2, %xmm1
1204 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1205 ; SSE2-NEXT: psllw $7, %xmm0
1206 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1207 ; SSE2-NEXT: pand %xmm2, %xmm2
1208 ; SSE2-NEXT: pand %xmm0, %xmm2
1209 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1210 ; SSE2-NEXT: psllw $5, %xmm0
1211 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1212 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1213 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1214 ; SSE2-NEXT: psllw $3, %xmm3
1215 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1216 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1217 ; SSE2-NEXT: por %xmm0, %xmm3
1218 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1219 ; SSE2-NEXT: paddb %xmm0, %xmm0
1220 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1221 ; SSE2-NEXT: por %xmm3, %xmm0
1222 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1223 ; SSE2-NEXT: psrlw $1, %xmm3
1224 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1225 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1226 ; SSE2-NEXT: por %xmm0, %xmm3
1227 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1228 ; SSE2-NEXT: psrlw $3, %xmm0
1229 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1230 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1231 ; SSE2-NEXT: por %xmm3, %xmm0
1232 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1233 ; SSE2-NEXT: psrlw $5, %xmm3
1234 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1235 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1236 ; SSE2-NEXT: por %xmm0, %xmm3
1237 ; SSE2-NEXT: psrlw $7, %xmm1
1238 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1239 ; SSE2-NEXT: pand %xmm0, %xmm0
1240 ; SSE2-NEXT: pand %xmm1, %xmm0
1241 ; SSE2-NEXT: por %xmm3, %xmm0
1242 ; SSE2-NEXT: por %xmm2, %xmm0
1243 ; SSE2-NEXT: retq
1247 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1248 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1249 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1250 ; SSSE3-NEXT: pand %xmm1, %xmm2
1251 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1252 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
1253 ; SSSE3-NEXT: psrlw $4, %xmm0
1254 ; SSSE3-NEXT: pand %xmm1, %xmm0
1255 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1256 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
1257 ; SSSE3-NEXT: por %xmm3, %xmm1
1258 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1259 ; SSSE3-NEXT: retq
1263 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1264 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1265 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
1266 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1267 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1268 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
1269 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
1270 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1271 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
1272 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
1273 ; AVX-NEXT: retq
1277 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
1278 ; XOP-NEXT: retq
1286 ; SSE2-NEXT: pxor %xmm1, %xmm1
1287 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1288 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1289 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1290 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1291 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1292 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1293 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1294 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1295 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
1296 ; SSE2-NEXT: packuswb %xmm2, %xmm1
1297 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1298 ; SSE2-NEXT: psllw $7, %xmm0
1299 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1300 ; SSE2-NEXT: pand %xmm2, %xmm2
1301 ; SSE2-NEXT: pand %xmm0, %xmm2
1302 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1303 ; SSE2-NEXT: psllw $5, %xmm0
1304 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1305 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1306 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1307 ; SSE2-NEXT: psllw $3, %xmm3
1308 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1309 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1310 ; SSE2-NEXT: por %xmm0, %xmm3
1311 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1312 ; SSE2-NEXT: paddb %xmm0, %xmm0
1313 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1314 ; SSE2-NEXT: por %xmm3, %xmm0
1315 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1316 ; SSE2-NEXT: psrlw $1, %xmm3
1317 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1318 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1319 ; SSE2-NEXT: por %xmm0, %xmm3
1320 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1321 ; SSE2-NEXT: psrlw $3, %xmm0
1322 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1323 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1324 ; SSE2-NEXT: por %xmm3, %xmm0
1325 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1326 ; SSE2-NEXT: psrlw $5, %xmm3
1327 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1328 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1329 ; SSE2-NEXT: por %xmm0, %xmm3
1330 ; SSE2-NEXT: psrlw $7, %xmm1
1331 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1332 ; SSE2-NEXT: pand %xmm0, %xmm0
1333 ; SSE2-NEXT: pand %xmm1, %xmm0
1334 ; SSE2-NEXT: por %xmm3, %xmm0
1335 ; SSE2-NEXT: por %xmm2, %xmm0
1336 ; SSE2-NEXT: retq
1340 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1341 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1342 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1343 ; SSSE3-NEXT: pand %xmm1, %xmm2
1344 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1345 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
1346 ; SSSE3-NEXT: psrlw $4, %xmm0
1347 ; SSSE3-NEXT: pand %xmm1, %xmm0
1348 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1349 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
1350 ; SSSE3-NEXT: por %xmm3, %xmm1
1351 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1352 ; SSSE3-NEXT: retq
1356 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1357 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1358 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
1359 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1360 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1361 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
1362 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
1363 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1364 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
1365 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
1366 ; AVX-NEXT: retq
1370 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
1371 ; XOP-NEXT: retq
1379 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1380 ; SSE2-NEXT: psllw $5, %xmm2
1381 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
1382 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm9
1383 ; SSE2-NEXT: pand %xmm9, %xmm2
1384 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1385 ; SSE2-NEXT: psllw $7, %xmm5
1386 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1387 ; SSE2-NEXT: pand %xmm10, %xmm10
1388 ; SSE2-NEXT: pand %xmm10, %xmm5
1389 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1390 ; SSE2-NEXT: psllw $3, %xmm3
1391 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
1392 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm11
1393 ; SSE2-NEXT: pand %xmm11, %xmm3
1394 ; SSE2-NEXT: por %xmm2, %xmm3
1395 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1396 ; SSE2-NEXT: paddb %xmm2, %xmm2
1397 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1398 ; SSE2-NEXT: pand %xmm8, %xmm2
1399 ; SSE2-NEXT: por %xmm3, %xmm2
1400 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1401 ; SSE2-NEXT: psrlw $1, %xmm3
1402 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1403 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
1404 ; SSE2-NEXT: pand %xmm12, %xmm3
1405 ; SSE2-NEXT: por %xmm2, %xmm3
1406 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1407 ; SSE2-NEXT: psrlw $3, %xmm4
1408 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1409 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
1410 ; SSE2-NEXT: pand %xmm6, %xmm4
1411 ; SSE2-NEXT: por %xmm3, %xmm4
1412 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1413 ; SSE2-NEXT: psrlw $5, %xmm7
1414 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1415 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1416 ; SSE2-NEXT: pand %xmm2, %xmm7
1417 ; SSE2-NEXT: por %xmm4, %xmm7
1418 ; SSE2-NEXT: psrlw $7, %xmm0
1419 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1420 ; SSE2-NEXT: pand %xmm3, %xmm3
1421 ; SSE2-NEXT: pand %xmm3, %xmm0
1422 ; SSE2-NEXT: por %xmm7, %xmm0
1423 ; SSE2-NEXT: por %xmm5, %xmm0
1424 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1425 ; SSE2-NEXT: psllw $5, %xmm4
1426 ; SSE2-NEXT: pand %xmm9, %xmm4
1427 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1428 ; SSE2-NEXT: psllw $7, %xmm5
1429 ; SSE2-NEXT: pand %xmm10, %xmm5
1430 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1431 ; SSE2-NEXT: psllw $3, %xmm7
1432 ; SSE2-NEXT: pand %xmm11, %xmm7
1433 ; SSE2-NEXT: por %xmm4, %xmm7
1434 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1435 ; SSE2-NEXT: paddb %xmm4, %xmm4
1436 ; SSE2-NEXT: pand %xmm8, %xmm4
1437 ; SSE2-NEXT: por %xmm7, %xmm4
1438 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1439 ; SSE2-NEXT: psrlw $1, %xmm7
1440 ; SSE2-NEXT: pand %xmm12, %xmm7
1441 ; SSE2-NEXT: por %xmm4, %xmm7
1442 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1443 ; SSE2-NEXT: psrlw $3, %xmm4
1444 ; SSE2-NEXT: pand %xmm6, %xmm4
1445 ; SSE2-NEXT: por %xmm7, %xmm4
1446 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1447 ; SSE2-NEXT: psrlw $5, %xmm6
1448 ; SSE2-NEXT: pand %xmm2, %xmm6
1449 ; SSE2-NEXT: por %xmm4, %xmm6
1450 ; SSE2-NEXT: psrlw $7, %xmm1
1451 ; SSE2-NEXT: pand %xmm3, %xmm1
1452 ; SSE2-NEXT: por %xmm6, %xmm1
1453 ; SSE2-NEXT: por %xmm5, %xmm1
1454 ; SSE2-NEXT: retq
1458 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1459 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1460 ; SSSE3-NEXT: pand %xmm4, %xmm2
1461 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1462 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
1463 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
1464 ; SSSE3-NEXT: psrlw $4, %xmm0
1465 ; SSSE3-NEXT: pand %xmm4, %xmm0
1466 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1467 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
1468 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
1469 ; SSSE3-NEXT: por %xmm6, %xmm3
1470 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1471 ; SSSE3-NEXT: pand %xmm4, %xmm0
1472 ; SSSE3-NEXT: pshufb %xmm0, %xmm5
1473 ; SSSE3-NEXT: psrlw $4, %xmm1
1474 ; SSSE3-NEXT: pand %xmm4, %xmm1
1475 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
1476 ; SSSE3-NEXT: por %xmm5, %xmm2
1477 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1478 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1479 ; SSSE3-NEXT: retq
1483 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1484 ; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1485 ; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
1486 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1487 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
1488 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1489 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1490 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1491 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
1492 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
1493 ; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
1494 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
1495 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1496 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1497 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
1498 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
1499 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1500 ; AVX1-NEXT: retq
1504 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1505 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
1506 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1507 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1508 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1509 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1510 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1511 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1512 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
1513 ; AVX2-NEXT: retq
1517 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1518 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
1519 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1520 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1521 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
1522 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1523 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1524 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1525 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
1526 ; AVX512-NEXT: retq
1530 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1531 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1532 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1533 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1534 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1535 ; XOPAVX1-NEXT: retq
1539 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1540 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1541 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1542 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1543 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1544 ; XOPAVX2-NEXT: retq
1552 ; SSE2-NEXT: pxor %xmm9, %xmm9
1553 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1554 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
1555 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
1556 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
1557 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1558 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
1559 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
1560 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1561 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1562 ; SSE2-NEXT: psllw $5, %xmm2
1563 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
1564 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
1565 ; SSE2-NEXT: pand %xmm10, %xmm2
1566 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1567 ; SSE2-NEXT: psllw $7, %xmm3
1568 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1569 ; SSE2-NEXT: pand %xmm11, %xmm11
1570 ; SSE2-NEXT: pand %xmm11, %xmm3
1571 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1572 ; SSE2-NEXT: psllw $3, %xmm4
1573 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
1574 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
1575 ; SSE2-NEXT: pand %xmm12, %xmm4
1576 ; SSE2-NEXT: por %xmm2, %xmm4
1577 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1578 ; SSE2-NEXT: paddb %xmm2, %xmm2
1579 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1580 ; SSE2-NEXT: pand %xmm8, %xmm2
1581 ; SSE2-NEXT: por %xmm4, %xmm2
1582 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1583 ; SSE2-NEXT: psrlw $1, %xmm4
1584 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1585 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
1586 ; SSE2-NEXT: pand %xmm13, %xmm4
1587 ; SSE2-NEXT: por %xmm2, %xmm4
1588 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1589 ; SSE2-NEXT: psrlw $3, %xmm5
1590 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1591 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
1592 ; SSE2-NEXT: pand %xmm6, %xmm5
1593 ; SSE2-NEXT: por %xmm4, %xmm5
1594 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1595 ; SSE2-NEXT: psrlw $5, %xmm7
1596 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1597 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1598 ; SSE2-NEXT: pand %xmm2, %xmm7
1599 ; SSE2-NEXT: por %xmm5, %xmm7
1600 ; SSE2-NEXT: psrlw $7, %xmm0
1601 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1602 ; SSE2-NEXT: pand %xmm4, %xmm4
1603 ; SSE2-NEXT: pand %xmm4, %xmm0
1604 ; SSE2-NEXT: por %xmm7, %xmm0
1605 ; SSE2-NEXT: por %xmm3, %xmm0
1606 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1607 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
1608 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
1609 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
1610 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
1611 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1612 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
1613 ; SSE2-NEXT: packuswb %xmm3, %xmm1
1614 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1615 ; SSE2-NEXT: psllw $5, %xmm5
1616 ; SSE2-NEXT: pand %xmm10, %xmm5
1617 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1618 ; SSE2-NEXT: psllw $7, %xmm3
1619 ; SSE2-NEXT: pand %xmm11, %xmm3
1620 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1621 ; SSE2-NEXT: psllw $3, %xmm7
1622 ; SSE2-NEXT: pand %xmm12, %xmm7
1623 ; SSE2-NEXT: por %xmm5, %xmm7
1624 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1625 ; SSE2-NEXT: paddb %xmm5, %xmm5
1626 ; SSE2-NEXT: pand %xmm8, %xmm5
1627 ; SSE2-NEXT: por %xmm7, %xmm5
1628 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1629 ; SSE2-NEXT: psrlw $1, %xmm7
1630 ; SSE2-NEXT: pand %xmm13, %xmm7
1631 ; SSE2-NEXT: por %xmm5, %xmm7
1632 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1633 ; SSE2-NEXT: psrlw $3, %xmm5
1634 ; SSE2-NEXT: pand %xmm6, %xmm5
1635 ; SSE2-NEXT: por %xmm7, %xmm5
1636 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1637 ; SSE2-NEXT: psrlw $5, %xmm6
1638 ; SSE2-NEXT: pand %xmm2, %xmm6
1639 ; SSE2-NEXT: por %xmm5, %xmm6
1640 ; SSE2-NEXT: psrlw $7, %xmm1
1641 ; SSE2-NEXT: pand %xmm4, %xmm1
1642 ; SSE2-NEXT: por %xmm6, %xmm1
1643 ; SSE2-NEXT: por %xmm3, %xmm1
1644 ; SSE2-NEXT: retq
1648 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1649 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
1650 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1651 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1652 ; SSSE3-NEXT: pand %xmm5, %xmm2
1653 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1654 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
1655 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
1656 ; SSSE3-NEXT: psrlw $4, %xmm0
1657 ; SSSE3-NEXT: pand %xmm5, %xmm0
1658 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1659 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
1660 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
1661 ; SSSE3-NEXT: por %xmm7, %xmm3
1662 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
1663 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1664 ; SSSE3-NEXT: pand %xmm5, %xmm0
1665 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1666 ; SSSE3-NEXT: psrlw $4, %xmm1
1667 ; SSSE3-NEXT: pand %xmm5, %xmm1
1668 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
1669 ; SSSE3-NEXT: por %xmm6, %xmm2
1670 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1671 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1672 ; SSSE3-NEXT: retq
1676 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1677 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1678 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1679 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1680 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
1681 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1682 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1683 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1684 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1685 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1686 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
1687 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
1688 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1689 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
1690 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
1691 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1692 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1693 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
1694 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1695 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1696 ; AVX1-NEXT: retq
1700 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1701 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1702 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
1703 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1704 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1705 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1706 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1707 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1708 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1709 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
1710 ; AVX2-NEXT: retq
1714 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1715 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1716 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
1717 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1718 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1719 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
1720 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1721 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1722 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1723 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
1724 ; AVX512-NEXT: retq
1728 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1729 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1730 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1731 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1732 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1733 ; XOPAVX1-NEXT: retq
1737 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1738 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1739 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1740 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1741 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1742 ; XOPAVX2-NEXT: retq
1750 ; SSE2-NEXT: pxor %xmm9, %xmm9
1751 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1752 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
1753 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1754 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1755 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1756 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1757 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1758 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1759 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1760 ; SSE2-NEXT: psllw $5, %xmm2
1761 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
1762 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
1763 ; SSE2-NEXT: pand %xmm10, %xmm2
1764 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1765 ; SSE2-NEXT: psllw $7, %xmm3
1766 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1767 ; SSE2-NEXT: pand %xmm11, %xmm11
1768 ; SSE2-NEXT: pand %xmm11, %xmm3
1769 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1770 ; SSE2-NEXT: psllw $3, %xmm4
1771 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
1772 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
1773 ; SSE2-NEXT: pand %xmm12, %xmm4
1774 ; SSE2-NEXT: por %xmm2, %xmm4
1775 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1776 ; SSE2-NEXT: paddb %xmm2, %xmm2
1777 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1778 ; SSE2-NEXT: pand %xmm8, %xmm2
1779 ; SSE2-NEXT: por %xmm4, %xmm2
1780 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1781 ; SSE2-NEXT: psrlw $1, %xmm4
1782 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1783 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
1784 ; SSE2-NEXT: pand %xmm13, %xmm4
1785 ; SSE2-NEXT: por %xmm2, %xmm4
1786 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1787 ; SSE2-NEXT: psrlw $3, %xmm5
1788 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1789 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
1790 ; SSE2-NEXT: pand %xmm6, %xmm5
1791 ; SSE2-NEXT: por %xmm4, %xmm5
1792 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1793 ; SSE2-NEXT: psrlw $5, %xmm7
1794 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1795 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1796 ; SSE2-NEXT: pand %xmm2, %xmm7
1797 ; SSE2-NEXT: por %xmm5, %xmm7
1798 ; SSE2-NEXT: psrlw $7, %xmm0
1799 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1800 ; SSE2-NEXT: pand %xmm4, %xmm4
1801 ; SSE2-NEXT: pand %xmm4, %xmm0
1802 ; SSE2-NEXT: por %xmm7, %xmm0
1803 ; SSE2-NEXT: por %xmm3, %xmm0
1804 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1805 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
1806 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1807 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1808 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
1809 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1810 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1811 ; SSE2-NEXT: packuswb %xmm3, %xmm1
1812 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1813 ; SSE2-NEXT: psllw $5, %xmm5
1814 ; SSE2-NEXT: pand %xmm10, %xmm5
1815 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1816 ; SSE2-NEXT: psllw $7, %xmm3
1817 ; SSE2-NEXT: pand %xmm11, %xmm3
1818 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1819 ; SSE2-NEXT: psllw $3, %xmm7
1820 ; SSE2-NEXT: pand %xmm12, %xmm7
1821 ; SSE2-NEXT: por %xmm5, %xmm7
1822 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1823 ; SSE2-NEXT: paddb %xmm5, %xmm5
1824 ; SSE2-NEXT: pand %xmm8, %xmm5
1825 ; SSE2-NEXT: por %xmm7, %xmm5
1826 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1827 ; SSE2-NEXT: psrlw $1, %xmm7
1828 ; SSE2-NEXT: pand %xmm13, %xmm7
1829 ; SSE2-NEXT: por %xmm5, %xmm7
1830 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1831 ; SSE2-NEXT: psrlw $3, %xmm5
1832 ; SSE2-NEXT: pand %xmm6, %xmm5
1833 ; SSE2-NEXT: por %xmm7, %xmm5
1834 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1835 ; SSE2-NEXT: psrlw $5, %xmm6
1836 ; SSE2-NEXT: pand %xmm2, %xmm6
1837 ; SSE2-NEXT: por %xmm5, %xmm6
1838 ; SSE2-NEXT: psrlw $7, %xmm1
1839 ; SSE2-NEXT: pand %xmm4, %xmm1
1840 ; SSE2-NEXT: por %xmm6, %xmm1
1841 ; SSE2-NEXT: por %xmm3, %xmm1
1842 ; SSE2-NEXT: retq
1846 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1847 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
1848 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1849 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1850 ; SSSE3-NEXT: pand %xmm5, %xmm2
1851 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1852 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
1853 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
1854 ; SSSE3-NEXT: psrlw $4, %xmm0
1855 ; SSSE3-NEXT: pand %xmm5, %xmm0
1856 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1857 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
1858 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
1859 ; SSSE3-NEXT: por %xmm7, %xmm3
1860 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
1861 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1862 ; SSSE3-NEXT: pand %xmm5, %xmm0
1863 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1864 ; SSSE3-NEXT: psrlw $4, %xmm1
1865 ; SSSE3-NEXT: pand %xmm5, %xmm1
1866 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
1867 ; SSSE3-NEXT: por %xmm6, %xmm2
1868 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1869 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1870 ; SSSE3-NEXT: retq
1874 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1875 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1876 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1877 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1878 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
1879 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1880 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1881 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1882 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1883 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1884 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
1885 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
1886 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1887 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
1888 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
1889 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1890 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1891 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
1892 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1893 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1894 ; AVX1-NEXT: retq
1898 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1899 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1900 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
1901 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1902 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1903 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1904 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1905 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1906 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1907 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
1908 ; AVX2-NEXT: retq
1912 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1913 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1914 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
1915 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1916 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1917 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
1918 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1919 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1920 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1921 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
1922 ; AVX512-NEXT: retq
1926 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1927 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1928 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1929 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1930 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1931 ; XOPAVX1-NEXT: retq
1935 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1936 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1937 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1938 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1939 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1940 ; XOPAVX2-NEXT: retq
1948 ; SSE2-NEXT: pxor %xmm9, %xmm9
1949 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1950 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
1951 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1952 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1953 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1954 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1955 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1956 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1957 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1958 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1959 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1960 ; SSE2-NEXT: psllw $5, %xmm2
1961 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
1962 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
1963 ; SSE2-NEXT: pand %xmm10, %xmm2
1964 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1965 ; SSE2-NEXT: psllw $7, %xmm4
1966 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1967 ; SSE2-NEXT: pand %xmm11, %xmm11
1968 ; SSE2-NEXT: pand %xmm11, %xmm4
1969 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1970 ; SSE2-NEXT: psllw $3, %xmm3
1971 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
1972 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
1973 ; SSE2-NEXT: pand %xmm12, %xmm3
1974 ; SSE2-NEXT: por %xmm2, %xmm3
1975 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1976 ; SSE2-NEXT: paddb %xmm2, %xmm2
1977 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
1978 ; SSE2-NEXT: pand %xmm8, %xmm2
1979 ; SSE2-NEXT: por %xmm3, %xmm2
1980 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1981 ; SSE2-NEXT: psrlw $1, %xmm3
1982 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1983 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
1984 ; SSE2-NEXT: pand %xmm13, %xmm3
1985 ; SSE2-NEXT: por %xmm2, %xmm3
1986 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1987 ; SSE2-NEXT: psrlw $3, %xmm5
1988 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
1989 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
1990 ; SSE2-NEXT: pand %xmm6, %xmm5
1991 ; SSE2-NEXT: por %xmm3, %xmm5
1992 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1993 ; SSE2-NEXT: psrlw $5, %xmm7
1994 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1995 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1996 ; SSE2-NEXT: pand %xmm2, %xmm7
1997 ; SSE2-NEXT: por %xmm5, %xmm7
1998 ; SSE2-NEXT: psrlw $7, %xmm0
1999 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2000 ; SSE2-NEXT: pand %xmm3, %xmm3
2001 ; SSE2-NEXT: pand %xmm3, %xmm0
2002 ; SSE2-NEXT: por %xmm7, %xmm0
2003 ; SSE2-NEXT: por %xmm4, %xmm0
2004 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2005 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2006 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2007 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2008 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2009 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
2010 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2011 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2012 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2013 ; SSE2-NEXT: packuswb %xmm4, %xmm1
2014 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2015 ; SSE2-NEXT: psllw $5, %xmm5
2016 ; SSE2-NEXT: pand %xmm10, %xmm5
2017 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2018 ; SSE2-NEXT: psllw $7, %xmm4
2019 ; SSE2-NEXT: pand %xmm11, %xmm4
2020 ; SSE2-NEXT: movdqa %xmm1, %xmm7
2021 ; SSE2-NEXT: psllw $3, %xmm7
2022 ; SSE2-NEXT: pand %xmm12, %xmm7
2023 ; SSE2-NEXT: por %xmm5, %xmm7
2024 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2025 ; SSE2-NEXT: paddb %xmm5, %xmm5
2026 ; SSE2-NEXT: pand %xmm8, %xmm5
2027 ; SSE2-NEXT: por %xmm7, %xmm5
2028 ; SSE2-NEXT: movdqa %xmm1, %xmm7
2029 ; SSE2-NEXT: psrlw $1, %xmm7
2030 ; SSE2-NEXT: pand %xmm13, %xmm7
2031 ; SSE2-NEXT: por %xmm5, %xmm7
2032 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2033 ; SSE2-NEXT: psrlw $3, %xmm5
2034 ; SSE2-NEXT: pand %xmm6, %xmm5
2035 ; SSE2-NEXT: por %xmm7, %xmm5
2036 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2037 ; SSE2-NEXT: psrlw $5, %xmm6
2038 ; SSE2-NEXT: pand %xmm2, %xmm6
2039 ; SSE2-NEXT: por %xmm5, %xmm6
2040 ; SSE2-NEXT: psrlw $7, %xmm1
2041 ; SSE2-NEXT: pand %xmm3, %xmm1
2042 ; SSE2-NEXT: por %xmm6, %xmm1
2043 ; SSE2-NEXT: por %xmm4, %xmm1
2044 ; SSE2-NEXT: retq
2048 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2049 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
2050 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2051 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
2052 ; SSSE3-NEXT: pand %xmm5, %xmm2
2053 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2054 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
2055 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
2056 ; SSSE3-NEXT: psrlw $4, %xmm0
2057 ; SSSE3-NEXT: pand %xmm5, %xmm0
2058 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2059 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
2060 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
2061 ; SSSE3-NEXT: por %xmm7, %xmm3
2062 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
2063 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
2064 ; SSSE3-NEXT: pand %xmm5, %xmm0
2065 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
2066 ; SSSE3-NEXT: psrlw $4, %xmm1
2067 ; SSSE3-NEXT: pand %xmm5, %xmm1
2068 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
2069 ; SSSE3-NEXT: por %xmm6, %xmm2
2070 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
2071 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
2072 ; SSSE3-NEXT: retq
2076 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2077 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2078 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2079 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2080 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
2081 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2082 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
2083 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2084 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
2085 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2086 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
2087 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
2088 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2089 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
2090 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
2091 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
2092 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2093 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
2094 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
2095 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2096 ; AVX1-NEXT: retq
2100 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
2101 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2102 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
2103 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2104 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
2105 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
2106 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2107 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2108 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
2109 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
2110 ; AVX2-NEXT: retq
2114 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
2115 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2116 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
2117 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2118 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
2119 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
2120 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
2121 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2122 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
2123 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
2124 ; AVX512-NEXT: retq
2128 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2129 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2130 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
2131 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
2132 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2133 ; XOPAVX1-NEXT: retq
2137 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2138 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2139 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
2140 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
2141 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2142 ; XOPAVX2-NEXT: retq
2150 ; SSE2-NEXT: movdqa %xmm0, %xmm4
2151 ; SSE2-NEXT: psllw $5, %xmm4
2152 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
2153 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm9
2154 ; SSE2-NEXT: pand %xmm9, %xmm4
2155 ; SSE2-NEXT: movdqa %xmm0, %xmm7
2156 ; SSE2-NEXT: psllw $7, %xmm7
2157 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
2158 ; SSE2-NEXT: pand %xmm10, %xmm10
2159 ; SSE2-NEXT: pand %xmm10, %xmm7
2160 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2161 ; SSE2-NEXT: psllw $3, %xmm5
2162 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
2163 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm11
2164 ; SSE2-NEXT: pand %xmm11, %xmm5
2165 ; SSE2-NEXT: por %xmm4, %xmm5
2166 ; SSE2-NEXT: movdqa %xmm0, %xmm4
2167 ; SSE2-NEXT: paddb %xmm4, %xmm4
2168 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2169 ; SSE2-NEXT: pand %xmm8, %xmm4
2170 ; SSE2-NEXT: por %xmm5, %xmm4
2171 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2172 ; SSE2-NEXT: psrlw $1, %xmm5
2173 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2174 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
2175 ; SSE2-NEXT: pand %xmm12, %xmm5
2176 ; SSE2-NEXT: por %xmm4, %xmm5
2177 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2178 ; SSE2-NEXT: psrlw $3, %xmm6
2179 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
2180 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
2181 ; SSE2-NEXT: pand %xmm13, %xmm6
2182 ; SSE2-NEXT: por %xmm5, %xmm6
2183 ; SSE2-NEXT: movdqa %xmm0, %xmm4
2184 ; SSE2-NEXT: psrlw $5, %xmm4
2185 ; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2186 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
2187 ; SSE2-NEXT: pand %xmm14, %xmm4
2188 ; SSE2-NEXT: por %xmm6, %xmm4
2189 ; SSE2-NEXT: psrlw $7, %xmm0
2190 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2191 ; SSE2-NEXT: pand %xmm6, %xmm6
2192 ; SSE2-NEXT: pand %xmm6, %xmm0
2193 ; SSE2-NEXT: por %xmm4, %xmm0
2194 ; SSE2-NEXT: por %xmm7, %xmm0
2195 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2196 ; SSE2-NEXT: psllw $5, %xmm4
2197 ; SSE2-NEXT: pand %xmm9, %xmm4
2198 ; SSE2-NEXT: movdqa %xmm1, %xmm7
2199 ; SSE2-NEXT: psllw $7, %xmm7
2200 ; SSE2-NEXT: pand %xmm10, %xmm7
2201 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2202 ; SSE2-NEXT: psllw $3, %xmm5
2203 ; SSE2-NEXT: pand %xmm11, %xmm5
2204 ; SSE2-NEXT: por %xmm4, %xmm5
2205 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2206 ; SSE2-NEXT: paddb %xmm4, %xmm4
2207 ; SSE2-NEXT: pand %xmm8, %xmm4
2208 ; SSE2-NEXT: por %xmm5, %xmm4
2209 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2210 ; SSE2-NEXT: psrlw $1, %xmm5
2211 ; SSE2-NEXT: pand %xmm12, %xmm5
2212 ; SSE2-NEXT: por %xmm4, %xmm5
2213 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2214 ; SSE2-NEXT: psrlw $3, %xmm4
2215 ; SSE2-NEXT: pand %xmm13, %xmm4
2216 ; SSE2-NEXT: por %xmm5, %xmm4
2217 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2218 ; SSE2-NEXT: psrlw $5, %xmm5
2219 ; SSE2-NEXT: pand %xmm14, %xmm5
2220 ; SSE2-NEXT: por %xmm4, %xmm5
2221 ; SSE2-NEXT: psrlw $7, %xmm1
2222 ; SSE2-NEXT: pand %xmm6, %xmm1
2223 ; SSE2-NEXT: por %xmm5, %xmm1
2224 ; SSE2-NEXT: por %xmm7, %xmm1
2225 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2226 ; SSE2-NEXT: psllw $5, %xmm4
2227 ; SSE2-NEXT: pand %xmm9, %xmm4
2228 ; SSE2-NEXT: movdqa %xmm2, %xmm7
2229 ; SSE2-NEXT: psllw $7, %xmm7
2230 ; SSE2-NEXT: pand %xmm10, %xmm7
2231 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2232 ; SSE2-NEXT: psllw $3, %xmm5
2233 ; SSE2-NEXT: pand %xmm11, %xmm5
2234 ; SSE2-NEXT: por %xmm4, %xmm5
2235 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2236 ; SSE2-NEXT: paddb %xmm4, %xmm4
2237 ; SSE2-NEXT: pand %xmm8, %xmm4
2238 ; SSE2-NEXT: por %xmm5, %xmm4
2239 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2240 ; SSE2-NEXT: psrlw $1, %xmm5
2241 ; SSE2-NEXT: pand %xmm12, %xmm5
2242 ; SSE2-NEXT: por %xmm4, %xmm5
2243 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2244 ; SSE2-NEXT: psrlw $3, %xmm4
2245 ; SSE2-NEXT: pand %xmm13, %xmm4
2246 ; SSE2-NEXT: por %xmm5, %xmm4
2247 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2248 ; SSE2-NEXT: psrlw $5, %xmm5
2249 ; SSE2-NEXT: pand %xmm14, %xmm5
2250 ; SSE2-NEXT: por %xmm4, %xmm5
2251 ; SSE2-NEXT: psrlw $7, %xmm2
2252 ; SSE2-NEXT: pand %xmm6, %xmm2
2253 ; SSE2-NEXT: por %xmm5, %xmm2
2254 ; SSE2-NEXT: por %xmm7, %xmm2
2255 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2256 ; SSE2-NEXT: psllw $5, %xmm4
2257 ; SSE2-NEXT: pand %xmm9, %xmm4
2258 ; SSE2-NEXT: movdqa %xmm3, %xmm7
2259 ; SSE2-NEXT: psllw $7, %xmm7
2260 ; SSE2-NEXT: pand %xmm10, %xmm7
2261 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2262 ; SSE2-NEXT: psllw $3, %xmm5
2263 ; SSE2-NEXT: pand %xmm11, %xmm5
2264 ; SSE2-NEXT: por %xmm4, %xmm5
2265 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2266 ; SSE2-NEXT: paddb %xmm4, %xmm4
2267 ; SSE2-NEXT: pand %xmm8, %xmm4
2268 ; SSE2-NEXT: por %xmm5, %xmm4
2269 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2270 ; SSE2-NEXT: psrlw $1, %xmm5
2271 ; SSE2-NEXT: pand %xmm12, %xmm5
2272 ; SSE2-NEXT: por %xmm4, %xmm5
2273 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2274 ; SSE2-NEXT: psrlw $3, %xmm4
2275 ; SSE2-NEXT: pand %xmm13, %xmm4
2276 ; SSE2-NEXT: por %xmm5, %xmm4
2277 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2278 ; SSE2-NEXT: psrlw $5, %xmm5
2279 ; SSE2-NEXT: pand %xmm14, %xmm5
2280 ; SSE2-NEXT: por %xmm4, %xmm5
2281 ; SSE2-NEXT: psrlw $7, %xmm3
2282 ; SSE2-NEXT: pand %xmm6, %xmm3
2283 ; SSE2-NEXT: por %xmm5, %xmm3
2284 ; SSE2-NEXT: por %xmm7, %xmm3
2285 ; SSE2-NEXT: retq
2289 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
2290 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2291 ; SSSE3-NEXT: pand %xmm8, %xmm0
2292 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2293 ; SSSE3-NEXT: movdqa %xmm9, %xmm6
2294 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
2295 ; SSSE3-NEXT: psrlw $4, %xmm5
2296 ; SSSE3-NEXT: pand %xmm8, %xmm5
2297 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2298 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
2299 ; SSSE3-NEXT: pshufb %xmm5, %xmm0
2300 ; SSSE3-NEXT: por %xmm6, %xmm0
2301 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
2302 ; SSSE3-NEXT: pand %xmm8, %xmm5
2303 ; SSSE3-NEXT: movdqa %xmm9, %xmm6
2304 ; SSSE3-NEXT: pshufb %xmm5, %xmm6
2305 ; SSSE3-NEXT: psrlw $4, %xmm1
2306 ; SSSE3-NEXT: pand %xmm8, %xmm1
2307 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
2308 ; SSSE3-NEXT: pshufb %xmm1, %xmm5
2309 ; SSSE3-NEXT: por %xmm6, %xmm5
2310 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
2311 ; SSSE3-NEXT: pand %xmm8, %xmm1
2312 ; SSSE3-NEXT: movdqa %xmm9, %xmm7
2313 ; SSSE3-NEXT: pshufb %xmm1, %xmm7
2314 ; SSSE3-NEXT: psrlw $4, %xmm2
2315 ; SSSE3-NEXT: pand %xmm8, %xmm2
2316 ; SSSE3-NEXT: movdqa %xmm4, %xmm6
2317 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
2318 ; SSSE3-NEXT: por %xmm7, %xmm6
2319 ; SSSE3-NEXT: movdqa %xmm3, %xmm1
2320 ; SSSE3-NEXT: pand %xmm8, %xmm1
2321 ; SSSE3-NEXT: pshufb %xmm1, %xmm9
2322 ; SSSE3-NEXT: psrlw $4, %xmm3
2323 ; SSSE3-NEXT: pand %xmm8, %xmm3
2324 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
2325 ; SSSE3-NEXT: por %xmm9, %xmm4
2326 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
2327 ; SSSE3-NEXT: movdqa %xmm6, %xmm2
2328 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
2329 ; SSSE3-NEXT: retq
2333 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2334 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2335 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4
2336 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2337 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
2338 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2339 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2340 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2341 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
2342 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
2343 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm4
2344 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
2345 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
2346 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2347 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
2348 ; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
2349 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2350 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2351 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4
2352 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
2353 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2354 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2355 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
2356 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
2357 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm4
2358 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
2359 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2360 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
2361 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
2362 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
2363 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2364 ; AVX1-NEXT: retq
2368 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2369 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
2370 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2371 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
2372 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
2373 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2374 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2375 ; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0
2376 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
2377 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
2378 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
2379 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2380 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
2381 ; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1
2382 ; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
2383 ; AVX2-NEXT: retq
2387 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2388 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
2389 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2390 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
2391 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
2392 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
2393 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2394 ; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0
2395 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
2396 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
2397 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
2398 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
2399 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
2400 ; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1
2401 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
2402 ; AVX512F-NEXT: retq
2406 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2407 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
2408 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2409 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
2410 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
2411 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
2412 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2413 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
2414 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
2415 ; AVX512BW-NEXT: retq
2419 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2420 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2421 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2422 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2423 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2424 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2425 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2426 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2427 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2428 ; XOPAVX1-NEXT: retq
2432 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2433 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
2434 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2435 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2436 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
2437 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2438 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2439 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2440 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2441 ; XOPAVX2-NEXT: retq
2449 ; SSE2-NEXT: pxor %xmm9, %xmm9
2450 ; SSE2-NEXT: movdqa %xmm0, %xmm4
2451 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2452 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
2453 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
2454 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
2455 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
2456 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
2457 ; SSE2-NEXT: packuswb %xmm4, %xmm0
2458 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2459 ; SSE2-NEXT: psllw $5, %xmm5
2460 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
2461 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
2462 ; SSE2-NEXT: pand %xmm10, %xmm5
2463 ; SSE2-NEXT: movdqa %xmm0, %xmm4
2464 ; SSE2-NEXT: psllw $7, %xmm4
2465 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
2466 ; SSE2-NEXT: pand %xmm11, %xmm11
2467 ; SSE2-NEXT: pand %xmm11, %xmm4
2468 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2469 ; SSE2-NEXT: psllw $3, %xmm6
2470 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
2471 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
2472 ; SSE2-NEXT: pand %xmm12, %xmm6
2473 ; SSE2-NEXT: por %xmm5, %xmm6
2474 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2475 ; SSE2-NEXT: paddb %xmm5, %xmm5
2476 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2477 ; SSE2-NEXT: pand %xmm8, %xmm5
2478 ; SSE2-NEXT: por %xmm6, %xmm5
2479 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2480 ; SSE2-NEXT: psrlw $1, %xmm6
2481 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2482 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
2483 ; SSE2-NEXT: pand %xmm13, %xmm6
2484 ; SSE2-NEXT: por %xmm5, %xmm6
2485 ; SSE2-NEXT: movdqa %xmm0, %xmm7
2486 ; SSE2-NEXT: psrlw $3, %xmm7
2487 ; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
2488 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
2489 ; SSE2-NEXT: pand %xmm14, %xmm7
2490 ; SSE2-NEXT: por %xmm6, %xmm7
2491 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2492 ; SSE2-NEXT: psrlw $5, %xmm5
2493 ; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2494 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm15
2495 ; SSE2-NEXT: pand %xmm15, %xmm5
2496 ; SSE2-NEXT: por %xmm7, %xmm5
2497 ; SSE2-NEXT: psrlw $7, %xmm0
2498 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2499 ; SSE2-NEXT: pand %xmm7, %xmm7
2500 ; SSE2-NEXT: pand %xmm7, %xmm0
2501 ; SSE2-NEXT: por %xmm5, %xmm0
2502 ; SSE2-NEXT: por %xmm4, %xmm0
2503 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2504 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2505 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
2506 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
2507 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
2508 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
2509 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
2510 ; SSE2-NEXT: packuswb %xmm4, %xmm1
2511 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2512 ; SSE2-NEXT: psllw $5, %xmm5
2513 ; SSE2-NEXT: pand %xmm10, %xmm5
2514 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2515 ; SSE2-NEXT: psllw $7, %xmm4
2516 ; SSE2-NEXT: pand %xmm11, %xmm4
2517 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2518 ; SSE2-NEXT: psllw $3, %xmm6
2519 ; SSE2-NEXT: pand %xmm12, %xmm6
2520 ; SSE2-NEXT: por %xmm5, %xmm6
2521 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2522 ; SSE2-NEXT: paddb %xmm5, %xmm5
2523 ; SSE2-NEXT: pand %xmm8, %xmm5
2524 ; SSE2-NEXT: por %xmm6, %xmm5
2525 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2526 ; SSE2-NEXT: psrlw $1, %xmm6
2527 ; SSE2-NEXT: pand %xmm13, %xmm6
2528 ; SSE2-NEXT: por %xmm5, %xmm6
2529 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2530 ; SSE2-NEXT: psrlw $3, %xmm5
2531 ; SSE2-NEXT: pand %xmm14, %xmm5
2532 ; SSE2-NEXT: por %xmm6, %xmm5
2533 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2534 ; SSE2-NEXT: psrlw $5, %xmm6
2535 ; SSE2-NEXT: pand %xmm15, %xmm6
2536 ; SSE2-NEXT: por %xmm5, %xmm6
2537 ; SSE2-NEXT: psrlw $7, %xmm1
2538 ; SSE2-NEXT: pand %xmm7, %xmm1
2539 ; SSE2-NEXT: por %xmm6, %xmm1
2540 ; SSE2-NEXT: por %xmm4, %xmm1
2541 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2542 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2543 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
2544 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
2545 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
2546 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
2547 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
2548 ; SSE2-NEXT: packuswb %xmm4, %xmm2
2549 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2550 ; SSE2-NEXT: psllw $5, %xmm5
2551 ; SSE2-NEXT: pand %xmm10, %xmm5
2552 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2553 ; SSE2-NEXT: psllw $7, %xmm4
2554 ; SSE2-NEXT: pand %xmm11, %xmm4
2555 ; SSE2-NEXT: movdqa %xmm2, %xmm6
2556 ; SSE2-NEXT: psllw $3, %xmm6
2557 ; SSE2-NEXT: pand %xmm12, %xmm6
2558 ; SSE2-NEXT: por %xmm5, %xmm6
2559 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2560 ; SSE2-NEXT: paddb %xmm5, %xmm5
2561 ; SSE2-NEXT: pand %xmm8, %xmm5
2562 ; SSE2-NEXT: por %xmm6, %xmm5
2563 ; SSE2-NEXT: movdqa %xmm2, %xmm6
2564 ; SSE2-NEXT: psrlw $1, %xmm6
2565 ; SSE2-NEXT: pand %xmm13, %xmm6
2566 ; SSE2-NEXT: por %xmm5, %xmm6
2567 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2568 ; SSE2-NEXT: psrlw $3, %xmm5
2569 ; SSE2-NEXT: pand %xmm14, %xmm5
2570 ; SSE2-NEXT: por %xmm6, %xmm5
2571 ; SSE2-NEXT: movdqa %xmm2, %xmm6
2572 ; SSE2-NEXT: psrlw $5, %xmm6
2573 ; SSE2-NEXT: pand %xmm15, %xmm6
2574 ; SSE2-NEXT: por %xmm5, %xmm6
2575 ; SSE2-NEXT: psrlw $7, %xmm2
2576 ; SSE2-NEXT: pand %xmm7, %xmm2
2577 ; SSE2-NEXT: por %xmm6, %xmm2
2578 ; SSE2-NEXT: por %xmm4, %xmm2
2579 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2580 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2581 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
2582 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
2583 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2584 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
2585 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
2586 ; SSE2-NEXT: packuswb %xmm4, %xmm3
2587 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2588 ; SSE2-NEXT: psllw $5, %xmm5
2589 ; SSE2-NEXT: pand %xmm10, %xmm5
2590 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2591 ; SSE2-NEXT: psllw $7, %xmm4
2592 ; SSE2-NEXT: pand %xmm11, %xmm4
2593 ; SSE2-NEXT: movdqa %xmm3, %xmm6
2594 ; SSE2-NEXT: psllw $3, %xmm6
2595 ; SSE2-NEXT: pand %xmm12, %xmm6
2596 ; SSE2-NEXT: por %xmm5, %xmm6
2597 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2598 ; SSE2-NEXT: paddb %xmm5, %xmm5
2599 ; SSE2-NEXT: pand %xmm8, %xmm5
2600 ; SSE2-NEXT: por %xmm6, %xmm5
2601 ; SSE2-NEXT: movdqa %xmm3, %xmm6
2602 ; SSE2-NEXT: psrlw $1, %xmm6
2603 ; SSE2-NEXT: pand %xmm13, %xmm6
2604 ; SSE2-NEXT: por %xmm5, %xmm6
2605 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2606 ; SSE2-NEXT: psrlw $3, %xmm5
2607 ; SSE2-NEXT: pand %xmm14, %xmm5
2608 ; SSE2-NEXT: por %xmm6, %xmm5
2609 ; SSE2-NEXT: movdqa %xmm3, %xmm6
2610 ; SSE2-NEXT: psrlw $5, %xmm6
2611 ; SSE2-NEXT: pand %xmm15, %xmm6
2612 ; SSE2-NEXT: por %xmm5, %xmm6
2613 ; SSE2-NEXT: psrlw $7, %xmm3
2614 ; SSE2-NEXT: pand %xmm7, %xmm3
2615 ; SSE2-NEXT: por %xmm6, %xmm3
2616 ; SSE2-NEXT: por %xmm4, %xmm3
2617 ; SSE2-NEXT: retq
2621 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
2622 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
2623 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2624 ; SSSE3-NEXT: pshufb %xmm8, %xmm1
2625 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2626 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
2627 ; SSSE3-NEXT: pand %xmm9, %xmm0
2628 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2629 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
2630 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
2631 ; SSSE3-NEXT: psrlw $4, %xmm1
2632 ; SSSE3-NEXT: pand %xmm9, %xmm1
2633 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2634 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
2635 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
2636 ; SSSE3-NEXT: por %xmm6, %xmm0
2637 ; SSSE3-NEXT: pshufb %xmm8, %xmm5
2638 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
2639 ; SSSE3-NEXT: pand %xmm9, %xmm1
2640 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
2641 ; SSSE3-NEXT: pshufb %xmm1, %xmm6
2642 ; SSSE3-NEXT: psrlw $4, %xmm5
2643 ; SSSE3-NEXT: pand %xmm9, %xmm5
2644 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
2645 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
2646 ; SSSE3-NEXT: por %xmm6, %xmm1
2647 ; SSSE3-NEXT: pshufb %xmm8, %xmm2
2648 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2649 ; SSSE3-NEXT: pand %xmm9, %xmm5
2650 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
2651 ; SSSE3-NEXT: pshufb %xmm5, %xmm6
2652 ; SSSE3-NEXT: psrlw $4, %xmm2
2653 ; SSSE3-NEXT: pand %xmm9, %xmm2
2654 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
2655 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
2656 ; SSSE3-NEXT: por %xmm6, %xmm5
2657 ; SSSE3-NEXT: pshufb %xmm8, %xmm3
2658 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
2659 ; SSSE3-NEXT: pand %xmm9, %xmm2
2660 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
2661 ; SSSE3-NEXT: psrlw $4, %xmm3
2662 ; SSSE3-NEXT: pand %xmm9, %xmm3
2663 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
2664 ; SSSE3-NEXT: por %xmm7, %xmm4
2665 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
2666 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
2667 ; SSSE3-NEXT: retq
2671 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2672 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2673 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2674 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2675 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2676 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2677 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2678 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2679 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2680 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2681 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2682 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2683 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2684 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
2685 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2686 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
2687 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2688 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
2689 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
2690 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2691 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2692 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2693 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2694 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2695 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2696 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2697 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2698 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2699 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2700 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
2701 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
2702 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2703 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2704 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
2705 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
2706 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2707 ; AVX1-NEXT: retq
2711 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2712 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2713 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2714 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
2715 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2716 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2717 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
2718 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
2719 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2720 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
2721 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
2722 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2723 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
2724 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2725 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2726 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2727 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
2728 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
2729 ; AVX2-NEXT: retq
2733 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2734 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2735 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2736 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4
2737 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2738 ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2739 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
2740 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
2741 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2742 ; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
2743 ; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
2744 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2745 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2
2746 ; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2747 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
2748 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
2749 ; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1
2750 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
2751 ; AVX512F-NEXT: retq
2755 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2756 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2757 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
2758 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2759 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
2760 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
2761 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
2762 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2763 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
2764 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
2765 ; AVX512BW-NEXT: retq
2769 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2770 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2771 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2772 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2773 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2774 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2775 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2776 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2777 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2778 ; XOPAVX1-NEXT: retq
2782 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2783 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2784 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2785 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2786 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
2787 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2788 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2789 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2790 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2791 ; XOPAVX2-NEXT: retq
2799 ; SSE2-NEXT: pxor %xmm9, %xmm9
2800 ; SSE2-NEXT: movdqa %xmm0, %xmm4
2801 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2802 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2803 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2804 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
2805 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2806 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2807 ; SSE2-NEXT: packuswb %xmm4, %xmm0
2808 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2809 ; SSE2-NEXT: psllw $5, %xmm5
2810 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
2811 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
2812 ; SSE2-NEXT: pand %xmm10, %xmm5
2813 ; SSE2-NEXT: movdqa %xmm0, %xmm4
2814 ; SSE2-NEXT: psllw $7, %xmm4
2815 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
2816 ; SSE2-NEXT: pand %xmm11, %xmm11
2817 ; SSE2-NEXT: pand %xmm11, %xmm4
2818 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2819 ; SSE2-NEXT: psllw $3, %xmm6
2820 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
2821 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
2822 ; SSE2-NEXT: pand %xmm12, %xmm6
2823 ; SSE2-NEXT: por %xmm5, %xmm6
2824 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2825 ; SSE2-NEXT: paddb %xmm5, %xmm5
2826 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
2827 ; SSE2-NEXT: pand %xmm8, %xmm5
2828 ; SSE2-NEXT: por %xmm6, %xmm5
2829 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2830 ; SSE2-NEXT: psrlw $1, %xmm6
2831 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2832 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
2833 ; SSE2-NEXT: pand %xmm13, %xmm6
2834 ; SSE2-NEXT: por %xmm5, %xmm6
2835 ; SSE2-NEXT: movdqa %xmm0, %xmm7
2836 ; SSE2-NEXT: psrlw $3, %xmm7
2837 ; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
2838 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
2839 ; SSE2-NEXT: pand %xmm14, %xmm7
2840 ; SSE2-NEXT: por %xmm6, %xmm7
2841 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2842 ; SSE2-NEXT: psrlw $5, %xmm5
2843 ; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2844 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm15
2845 ; SSE2-NEXT: pand %xmm15, %xmm5
2846 ; SSE2-NEXT: por %xmm7, %xmm5
2847 ; SSE2-NEXT: psrlw $7, %xmm0
2848 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2849 ; SSE2-NEXT: pand %xmm7, %xmm7
2850 ; SSE2-NEXT: pand %xmm7, %xmm0
2851 ; SSE2-NEXT: por %xmm5, %xmm0
2852 ; SSE2-NEXT: por %xmm4, %xmm0
2853 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2854 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2855 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2856 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2857 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
2858 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2859 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2860 ; SSE2-NEXT: packuswb %xmm4, %xmm1
2861 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2862 ; SSE2-NEXT: psllw $5, %xmm5
2863 ; SSE2-NEXT: pand %xmm10, %xmm5
2864 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2865 ; SSE2-NEXT: psllw $7, %xmm4
2866 ; SSE2-NEXT: pand %xmm11, %xmm4
2867 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2868 ; SSE2-NEXT: psllw $3, %xmm6
2869 ; SSE2-NEXT: pand %xmm12, %xmm6
2870 ; SSE2-NEXT: por %xmm5, %xmm6
2871 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2872 ; SSE2-NEXT: paddb %xmm5, %xmm5
2873 ; SSE2-NEXT: pand %xmm8, %xmm5
2874 ; SSE2-NEXT: por %xmm6, %xmm5
2875 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2876 ; SSE2-NEXT: psrlw $1, %xmm6
2877 ; SSE2-NEXT: pand %xmm13, %xmm6
2878 ; SSE2-NEXT: por %xmm5, %xmm6
2879 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2880 ; SSE2-NEXT: psrlw $3, %xmm5
2881 ; SSE2-NEXT: pand %xmm14, %xmm5
2882 ; SSE2-NEXT: por %xmm6, %xmm5
2883 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2884 ; SSE2-NEXT: psrlw $5, %xmm6
2885 ; SSE2-NEXT: pand %xmm15, %xmm6
2886 ; SSE2-NEXT: por %xmm5, %xmm6
2887 ; SSE2-NEXT: psrlw $7, %xmm1
2888 ; SSE2-NEXT: pand %xmm7, %xmm1
2889 ; SSE2-NEXT: por %xmm6, %xmm1
2890 ; SSE2-NEXT: por %xmm4, %xmm1
2891 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2892 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2893 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2894 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2895 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
2896 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2897 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2898 ; SSE2-NEXT: packuswb %xmm4, %xmm2
2899 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2900 ; SSE2-NEXT: psllw $5, %xmm5
2901 ; SSE2-NEXT: pand %xmm10, %xmm5
2902 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2903 ; SSE2-NEXT: psllw $7, %xmm4
2904 ; SSE2-NEXT: pand %xmm11, %xmm4
2905 ; SSE2-NEXT: movdqa %xmm2, %xmm6
2906 ; SSE2-NEXT: psllw $3, %xmm6
2907 ; SSE2-NEXT: pand %xmm12, %xmm6
2908 ; SSE2-NEXT: por %xmm5, %xmm6
2909 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2910 ; SSE2-NEXT: paddb %xmm5, %xmm5
2911 ; SSE2-NEXT: pand %xmm8, %xmm5
2912 ; SSE2-NEXT: por %xmm6, %xmm5
2913 ; SSE2-NEXT: movdqa %xmm2, %xmm6
2914 ; SSE2-NEXT: psrlw $1, %xmm6
2915 ; SSE2-NEXT: pand %xmm13, %xmm6
2916 ; SSE2-NEXT: por %xmm5, %xmm6
2917 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2918 ; SSE2-NEXT: psrlw $3, %xmm5
2919 ; SSE2-NEXT: pand %xmm14, %xmm5
2920 ; SSE2-NEXT: por %xmm6, %xmm5
2921 ; SSE2-NEXT: movdqa %xmm2, %xmm6
2922 ; SSE2-NEXT: psrlw $5, %xmm6
2923 ; SSE2-NEXT: pand %xmm15, %xmm6
2924 ; SSE2-NEXT: por %xmm5, %xmm6
2925 ; SSE2-NEXT: psrlw $7, %xmm2
2926 ; SSE2-NEXT: pand %xmm7, %xmm2
2927 ; SSE2-NEXT: por %xmm6, %xmm2
2928 ; SSE2-NEXT: por %xmm4, %xmm2
2929 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2930 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2931 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2932 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2933 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2934 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2935 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2936 ; SSE2-NEXT: packuswb %xmm4, %xmm3
2937 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2938 ; SSE2-NEXT: psllw $5, %xmm5
2939 ; SSE2-NEXT: pand %xmm10, %xmm5
2940 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2941 ; SSE2-NEXT: psllw $7, %xmm4
2942 ; SSE2-NEXT: pand %xmm11, %xmm4
2943 ; SSE2-NEXT: movdqa %xmm3, %xmm6
2944 ; SSE2-NEXT: psllw $3, %xmm6
2945 ; SSE2-NEXT: pand %xmm12, %xmm6
2946 ; SSE2-NEXT: por %xmm5, %xmm6
2947 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2948 ; SSE2-NEXT: paddb %xmm5, %xmm5
2949 ; SSE2-NEXT: pand %xmm8, %xmm5
2950 ; SSE2-NEXT: por %xmm6, %xmm5
2951 ; SSE2-NEXT: movdqa %xmm3, %xmm6
2952 ; SSE2-NEXT: psrlw $1, %xmm6
2953 ; SSE2-NEXT: pand %xmm13, %xmm6
2954 ; SSE2-NEXT: por %xmm5, %xmm6
2955 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2956 ; SSE2-NEXT: psrlw $3, %xmm5
2957 ; SSE2-NEXT: pand %xmm14, %xmm5
2958 ; SSE2-NEXT: por %xmm6, %xmm5
2959 ; SSE2-NEXT: movdqa %xmm3, %xmm6
2960 ; SSE2-NEXT: psrlw $5, %xmm6
2961 ; SSE2-NEXT: pand %xmm15, %xmm6
2962 ; SSE2-NEXT: por %xmm5, %xmm6
2963 ; SSE2-NEXT: psrlw $7, %xmm3
2964 ; SSE2-NEXT: pand %xmm7, %xmm3
2965 ; SSE2-NEXT: por %xmm6, %xmm3
2966 ; SSE2-NEXT: por %xmm4, %xmm3
2967 ; SSE2-NEXT: retq
2971 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
2972 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
2973 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2974 ; SSSE3-NEXT: pshufb %xmm8, %xmm1
2975 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2976 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
2977 ; SSSE3-NEXT: pand %xmm9, %xmm0
2978 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2979 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
2980 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
2981 ; SSSE3-NEXT: psrlw $4, %xmm1
2982 ; SSSE3-NEXT: pand %xmm9, %xmm1
2983 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2984 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
2985 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
2986 ; SSSE3-NEXT: por %xmm6, %xmm0
2987 ; SSSE3-NEXT: pshufb %xmm8, %xmm5
2988 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
2989 ; SSSE3-NEXT: pand %xmm9, %xmm1
2990 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
2991 ; SSSE3-NEXT: pshufb %xmm1, %xmm6
2992 ; SSSE3-NEXT: psrlw $4, %xmm5
2993 ; SSSE3-NEXT: pand %xmm9, %xmm5
2994 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
2995 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
2996 ; SSSE3-NEXT: por %xmm6, %xmm1
2997 ; SSSE3-NEXT: pshufb %xmm8, %xmm2
2998 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2999 ; SSSE3-NEXT: pand %xmm9, %xmm5
3000 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
3001 ; SSSE3-NEXT: pshufb %xmm5, %xmm6
3002 ; SSSE3-NEXT: psrlw $4, %xmm2
3003 ; SSSE3-NEXT: pand %xmm9, %xmm2
3004 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
3005 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
3006 ; SSSE3-NEXT: por %xmm6, %xmm5
3007 ; SSSE3-NEXT: pshufb %xmm8, %xmm3
3008 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
3009 ; SSSE3-NEXT: pand %xmm9, %xmm2
3010 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
3011 ; SSSE3-NEXT: psrlw $4, %xmm3
3012 ; SSSE3-NEXT: pand %xmm9, %xmm3
3013 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
3014 ; SSSE3-NEXT: por %xmm7, %xmm4
3015 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
3016 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
3017 ; SSSE3-NEXT: retq
3021 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3022 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
3023 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
3024 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3025 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
3026 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3027 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
3028 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
3029 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
3030 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3031 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
3032 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
3033 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
3034 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
3035 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
3036 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
3037 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
3038 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
3039 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
3040 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
3041 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3042 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
3043 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
3044 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
3045 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
3046 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
3047 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
3048 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
3049 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
3050 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
3051 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3052 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
3053 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
3054 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
3055 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
3056 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
3057 ; AVX1-NEXT: retq
3061 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
3062 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3063 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3064 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
3065 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3066 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
3067 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
3068 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
3069 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3070 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
3071 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
3072 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3073 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
3074 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
3075 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
3076 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
3077 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
3078 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
3079 ; AVX2-NEXT: retq
3083 ; AVX512F-NEXT: vpslld $29, %zmm0, %zmm1
3084 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
3085 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm2
3086 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3087 ; AVX512F-NEXT: vpord %zmm1, %zmm2, %zmm1
3088 ; AVX512F-NEXT: vpslld $27, %zmm0, %zmm2
3089 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3090 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3091 ; AVX512F-NEXT: vpslld $25, %zmm0, %zmm2
3092 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3093 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3094 ; AVX512F-NEXT: vpslld $23, %zmm0, %zmm2
3095 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3096 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3097 ; AVX512F-NEXT: vpslld $21, %zmm0, %zmm2
3098 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3099 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3100 ; AVX512F-NEXT: vpslld $19, %zmm0, %zmm2
3101 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3102 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3103 ; AVX512F-NEXT: vpslld $17, %zmm0, %zmm2
3104 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3105 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3106 ; AVX512F-NEXT: vpslld $15, %zmm0, %zmm2
3107 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3108 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3109 ; AVX512F-NEXT: vpslld $13, %zmm0, %zmm2
3110 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3111 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3112 ; AVX512F-NEXT: vpslld $11, %zmm0, %zmm2
3113 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3114 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3115 ; AVX512F-NEXT: vpslld $9, %zmm0, %zmm2
3116 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3117 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3118 ; AVX512F-NEXT: vpslld $7, %zmm0, %zmm2
3119 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3120 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3121 ; AVX512F-NEXT: vpslld $5, %zmm0, %zmm2
3122 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3123 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3124 ; AVX512F-NEXT: vpslld $3, %zmm0, %zmm2
3125 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3126 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3127 ; AVX512F-NEXT: vpslld $1, %zmm0, %zmm2
3128 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3129 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3130 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm2
3131 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3132 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3133 ; AVX512F-NEXT: vpsrld $3, %zmm0, %zmm2
3134 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3135 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3136 ; AVX512F-NEXT: vpsrld $5, %zmm0, %zmm2
3137 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3138 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3139 ; AVX512F-NEXT: vpsrld $7, %zmm0, %zmm2
3140 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3141 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3142 ; AVX512F-NEXT: vpsrld $9, %zmm0, %zmm2
3143 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3144 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3145 ; AVX512F-NEXT: vpsrld $11, %zmm0, %zmm2
3146 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3147 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3148 ; AVX512F-NEXT: vpsrld $13, %zmm0, %zmm2
3149 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3150 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3151 ; AVX512F-NEXT: vpsrld $15, %zmm0, %zmm2
3152 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3153 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3154 ; AVX512F-NEXT: vpsrld $17, %zmm0, %zmm2
3155 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3156 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3157 ; AVX512F-NEXT: vpsrld $19, %zmm0, %zmm2
3158 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3159 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3160 ; AVX512F-NEXT: vpsrld $21, %zmm0, %zmm2
3161 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3162 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3163 ; AVX512F-NEXT: vpsrld $23, %zmm0, %zmm2
3164 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3165 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3166 ; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm2
3167 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3168 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3169 ; AVX512F-NEXT: vpsrld $27, %zmm0, %zmm2
3170 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3171 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3172 ; AVX512F-NEXT: vpsrld $29, %zmm0, %zmm2
3173 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
3174 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1
3175 ; AVX512F-NEXT: vpsrld $31, %zmm0, %zmm0
3176 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
3177 ; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0
3178 ; AVX512F-NEXT: retq
3182 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
3183 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3184 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
3185 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3186 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
3187 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
3188 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
3189 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3190 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
3191 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
3192 ; AVX512BW-NEXT: retq
3196 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3197 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
3198 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
3199 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
3200 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
3201 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3202 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
3203 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
3204 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
3205 ; XOPAVX1-NEXT: retq
3209 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
3210 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
3211 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
3212 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
3213 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
3214 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
3215 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
3216 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
3217 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
3218 ; XOPAVX2-NEXT: retq
3226 ; SSE2-NEXT: pxor %xmm9, %xmm9
3227 ; SSE2-NEXT: movdqa %xmm0, %xmm4
3228 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
3229 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
3230 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
3231 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
3232 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
3233 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
3234 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
3235 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
3236 ; SSE2-NEXT: packuswb %xmm4, %xmm0
3237 ; SSE2-NEXT: movdqa %xmm0, %xmm5
3238 ; SSE2-NEXT: psllw $5, %xmm5
3239 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
3240 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
3241 ; SSE2-NEXT: pand %xmm10, %xmm5
3242 ; SSE2-NEXT: movdqa %xmm0, %xmm4
3243 ; SSE2-NEXT: psllw $7, %xmm4
3244 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
3245 ; SSE2-NEXT: pand %xmm11, %xmm11
3246 ; SSE2-NEXT: pand %xmm11, %xmm4
3247 ; SSE2-NEXT: movdqa %xmm0, %xmm6
3248 ; SSE2-NEXT: psllw $3, %xmm6
3249 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
3250 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
3251 ; SSE2-NEXT: pand %xmm12, %xmm6
3252 ; SSE2-NEXT: por %xmm5, %xmm6
3253 ; SSE2-NEXT: movdqa %xmm0, %xmm5
3254 ; SSE2-NEXT: paddb %xmm5, %xmm5
3255 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
3256 ; SSE2-NEXT: pand %xmm8, %xmm5
3257 ; SSE2-NEXT: por %xmm6, %xmm5
3258 ; SSE2-NEXT: movdqa %xmm0, %xmm6
3259 ; SSE2-NEXT: psrlw $1, %xmm6
3260 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
3261 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
3262 ; SSE2-NEXT: pand %xmm13, %xmm6
3263 ; SSE2-NEXT: por %xmm5, %xmm6
3264 ; SSE2-NEXT: movdqa %xmm0, %xmm7
3265 ; SSE2-NEXT: psrlw $3, %xmm7
3266 ; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
3267 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
3268 ; SSE2-NEXT: pand %xmm14, %xmm7
3269 ; SSE2-NEXT: por %xmm6, %xmm7
3270 ; SSE2-NEXT: movdqa %xmm0, %xmm5
3271 ; SSE2-NEXT: psrlw $5, %xmm5
3272 ; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
3273 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm15
3274 ; SSE2-NEXT: pand %xmm15, %xmm5
3275 ; SSE2-NEXT: por %xmm7, %xmm5
3276 ; SSE2-NEXT: psrlw $7, %xmm0
3277 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3278 ; SSE2-NEXT: pand %xmm7, %xmm7
3279 ; SSE2-NEXT: pand %xmm7, %xmm0
3280 ; SSE2-NEXT: por %xmm5, %xmm0
3281 ; SSE2-NEXT: por %xmm4, %xmm0
3282 ; SSE2-NEXT: movdqa %xmm1, %xmm4
3283 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
3284 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
3285 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
3286 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
3287 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
3288 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
3289 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
3290 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
3291 ; SSE2-NEXT: packuswb %xmm4, %xmm1
3292 ; SSE2-NEXT: movdqa %xmm1, %xmm5
3293 ; SSE2-NEXT: psllw $5, %xmm5
3294 ; SSE2-NEXT: pand %xmm10, %xmm5
3295 ; SSE2-NEXT: movdqa %xmm1, %xmm4
3296 ; SSE2-NEXT: psllw $7, %xmm4
3297 ; SSE2-NEXT: pand %xmm11, %xmm4
3298 ; SSE2-NEXT: movdqa %xmm1, %xmm6
3299 ; SSE2-NEXT: psllw $3, %xmm6
3300 ; SSE2-NEXT: pand %xmm12, %xmm6
3301 ; SSE2-NEXT: por %xmm5, %xmm6
3302 ; SSE2-NEXT: movdqa %xmm1, %xmm5
3303 ; SSE2-NEXT: paddb %xmm5, %xmm5
3304 ; SSE2-NEXT: pand %xmm8, %xmm5
3305 ; SSE2-NEXT: por %xmm6, %xmm5
3306 ; SSE2-NEXT: movdqa %xmm1, %xmm6
3307 ; SSE2-NEXT: psrlw $1, %xmm6
3308 ; SSE2-NEXT: pand %xmm13, %xmm6
3309 ; SSE2-NEXT: por %xmm5, %xmm6
3310 ; SSE2-NEXT: movdqa %xmm1, %xmm5
3311 ; SSE2-NEXT: psrlw $3, %xmm5
3312 ; SSE2-NEXT: pand %xmm14, %xmm5
3313 ; SSE2-NEXT: por %xmm6, %xmm5
3314 ; SSE2-NEXT: movdqa %xmm1, %xmm6
3315 ; SSE2-NEXT: psrlw $5, %xmm6
3316 ; SSE2-NEXT: pand %xmm15, %xmm6
3317 ; SSE2-NEXT: por %xmm5, %xmm6
3318 ; SSE2-NEXT: psrlw $7, %xmm1
3319 ; SSE2-NEXT: pand %xmm7, %xmm1
3320 ; SSE2-NEXT: por %xmm6, %xmm1
3321 ; SSE2-NEXT: por %xmm4, %xmm1
3322 ; SSE2-NEXT: movdqa %xmm2, %xmm4
3323 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
3324 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
3325 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
3326 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
3327 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
3328 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
3329 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
3330 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
3331 ; SSE2-NEXT: packuswb %xmm4, %xmm2
3332 ; SSE2-NEXT: movdqa %xmm2, %xmm5
3333 ; SSE2-NEXT: psllw $5, %xmm5
3334 ; SSE2-NEXT: pand %xmm10, %xmm5
3335 ; SSE2-NEXT: movdqa %xmm2, %xmm4
3336 ; SSE2-NEXT: psllw $7, %xmm4
3337 ; SSE2-NEXT: pand %xmm11, %xmm4
3338 ; SSE2-NEXT: movdqa %xmm2, %xmm6
3339 ; SSE2-NEXT: psllw $3, %xmm6
3340 ; SSE2-NEXT: pand %xmm12, %xmm6
3341 ; SSE2-NEXT: por %xmm5, %xmm6
3342 ; SSE2-NEXT: movdqa %xmm2, %xmm5
3343 ; SSE2-NEXT: paddb %xmm5, %xmm5
3344 ; SSE2-NEXT: pand %xmm8, %xmm5
3345 ; SSE2-NEXT: por %xmm6, %xmm5
3346 ; SSE2-NEXT: movdqa %xmm2, %xmm6
3347 ; SSE2-NEXT: psrlw $1, %xmm6
3348 ; SSE2-NEXT: pand %xmm13, %xmm6
3349 ; SSE2-NEXT: por %xmm5, %xmm6
3350 ; SSE2-NEXT: movdqa %xmm2, %xmm5
3351 ; SSE2-NEXT: psrlw $3, %xmm5
3352 ; SSE2-NEXT: pand %xmm14, %xmm5
3353 ; SSE2-NEXT: por %xmm6, %xmm5
3354 ; SSE2-NEXT: movdqa %xmm2, %xmm6
3355 ; SSE2-NEXT: psrlw $5, %xmm6
3356 ; SSE2-NEXT: pand %xmm15, %xmm6
3357 ; SSE2-NEXT: por %xmm5, %xmm6
3358 ; SSE2-NEXT: psrlw $7, %xmm2
3359 ; SSE2-NEXT: pand %xmm7, %xmm2
3360 ; SSE2-NEXT: por %xmm6, %xmm2
3361 ; SSE2-NEXT: por %xmm4, %xmm2
3362 ; SSE2-NEXT: movdqa %xmm3, %xmm4
3363 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
3364 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
3365 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
3366 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
3367 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
3368 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
3369 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
3370 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
3371 ; SSE2-NEXT: packuswb %xmm4, %xmm3
3372 ; SSE2-NEXT: movdqa %xmm3, %xmm5
3373 ; SSE2-NEXT: psllw $5, %xmm5
3374 ; SSE2-NEXT: pand %xmm10, %xmm5
3375 ; SSE2-NEXT: movdqa %xmm3, %xmm4
3376 ; SSE2-NEXT: psllw $7, %xmm4
3377 ; SSE2-NEXT: pand %xmm11, %xmm4
3378 ; SSE2-NEXT: movdqa %xmm3, %xmm6
3379 ; SSE2-NEXT: psllw $3, %xmm6
3380 ; SSE2-NEXT: pand %xmm12, %xmm6
3381 ; SSE2-NEXT: por %xmm5, %xmm6
3382 ; SSE2-NEXT: movdqa %xmm3, %xmm5
3383 ; SSE2-NEXT: paddb %xmm5, %xmm5
3384 ; SSE2-NEXT: pand %xmm8, %xmm5
3385 ; SSE2-NEXT: por %xmm6, %xmm5
3386 ; SSE2-NEXT: movdqa %xmm3, %xmm6
3387 ; SSE2-NEXT: psrlw $1, %xmm6
3388 ; SSE2-NEXT: pand %xmm13, %xmm6
3389 ; SSE2-NEXT: por %xmm5, %xmm6
3390 ; SSE2-NEXT: movdqa %xmm3, %xmm5
3391 ; SSE2-NEXT: psrlw $3, %xmm5
3392 ; SSE2-NEXT: pand %xmm14, %xmm5
3393 ; SSE2-NEXT: por %xmm6, %xmm5
3394 ; SSE2-NEXT: movdqa %xmm3, %xmm6
3395 ; SSE2-NEXT: psrlw $5, %xmm6
3396 ; SSE2-NEXT: pand %xmm15, %xmm6
3397 ; SSE2-NEXT: por %xmm5, %xmm6
3398 ; SSE2-NEXT: psrlw $7, %xmm3
3399 ; SSE2-NEXT: pand %xmm7, %xmm3
3400 ; SSE2-NEXT: por %xmm6, %xmm3
3401 ; SSE2-NEXT: por %xmm4, %xmm3
3402 ; SSE2-NEXT: retq
3406 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
3407 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
3408 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3409 ; SSSE3-NEXT: pshufb %xmm8, %xmm1
3410 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3411 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
3412 ; SSSE3-NEXT: pand %xmm9, %xmm0
3413 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3414 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
3415 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
3416 ; SSSE3-NEXT: psrlw $4, %xmm1
3417 ; SSSE3-NEXT: pand %xmm9, %xmm1
3418 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3419 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
3420 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
3421 ; SSSE3-NEXT: por %xmm6, %xmm0
3422 ; SSSE3-NEXT: pshufb %xmm8, %xmm5
3423 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
3424 ; SSSE3-NEXT: pand %xmm9, %xmm1
3425 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
3426 ; SSSE3-NEXT: pshufb %xmm1, %xmm6
3427 ; SSSE3-NEXT: psrlw $4, %xmm5
3428 ; SSSE3-NEXT: pand %xmm9, %xmm5
3429 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
3430 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
3431 ; SSSE3-NEXT: por %xmm6, %xmm1
3432 ; SSSE3-NEXT: pshufb %xmm8, %xmm2
3433 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
3434 ; SSSE3-NEXT: pand %xmm9, %xmm5
3435 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
3436 ; SSSE3-NEXT: pshufb %xmm5, %xmm6
3437 ; SSSE3-NEXT: psrlw $4, %xmm2
3438 ; SSSE3-NEXT: pand %xmm9, %xmm2
3439 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
3440 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
3441 ; SSSE3-NEXT: por %xmm6, %xmm5
3442 ; SSSE3-NEXT: pshufb %xmm8, %xmm3
3443 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
3444 ; SSSE3-NEXT: pand %xmm9, %xmm2
3445 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
3446 ; SSSE3-NEXT: psrlw $4, %xmm3
3447 ; SSSE3-NEXT: pand %xmm9, %xmm3
3448 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
3449 ; SSSE3-NEXT: por %xmm7, %xmm4
3450 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
3451 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
3452 ; SSSE3-NEXT: retq
3456 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3457 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3458 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
3459 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3460 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
3461 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3462 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
3463 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
3464 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
3465 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3466 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
3467 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
3468 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
3469 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
3470 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
3471 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
3472 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
3473 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
3474 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
3475 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
3476 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3477 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
3478 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
3479 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
3480 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
3481 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
3482 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
3483 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
3484 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
3485 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
3486 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3487 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
3488 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
3489 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
3490 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
3491 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
3492 ; AVX1-NEXT: retq
3496 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
3497 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3498 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3499 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
3500 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3501 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
3502 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
3503 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
3504 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3505 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
3506 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
3507 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3508 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
3509 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
3510 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
3511 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
3512 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
3513 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
3514 ; AVX2-NEXT: retq
3518 ; AVX512F-NEXT: vpsllq $61, %zmm0, %zmm1
3519 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
3520 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm2
3521 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3522 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
3523 ; AVX512F-NEXT: vpsllq $59, %zmm0, %zmm2
3524 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3525 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3526 ; AVX512F-NEXT: vpsllq $57, %zmm0, %zmm2
3527 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3528 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3529 ; AVX512F-NEXT: vpsllq $55, %zmm0, %zmm2
3530 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3531 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3532 ; AVX512F-NEXT: vpsllq $53, %zmm0, %zmm2
3533 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3534 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3535 ; AVX512F-NEXT: vpsllq $51, %zmm0, %zmm2
3536 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3537 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3538 ; AVX512F-NEXT: vpsllq $49, %zmm0, %zmm2
3539 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3540 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3541 ; AVX512F-NEXT: vpsllq $47, %zmm0, %zmm2
3542 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3543 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3544 ; AVX512F-NEXT: vpsllq $45, %zmm0, %zmm2
3545 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3546 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3547 ; AVX512F-NEXT: vpsllq $43, %zmm0, %zmm2
3548 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3549 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3550 ; AVX512F-NEXT: vpsllq $41, %zmm0, %zmm2
3551 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3552 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3553 ; AVX512F-NEXT: vpsllq $39, %zmm0, %zmm2
3554 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3555 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3556 ; AVX512F-NEXT: vpsllq $37, %zmm0, %zmm2
3557 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3558 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3559 ; AVX512F-NEXT: vpsllq $35, %zmm0, %zmm2
3560 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3561 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3562 ; AVX512F-NEXT: vpsllq $33, %zmm0, %zmm2
3563 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3564 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3565 ; AVX512F-NEXT: vpsllq $31, %zmm0, %zmm2
3566 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3567 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3568 ; AVX512F-NEXT: vpsllq $29, %zmm0, %zmm2
3569 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3570 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3571 ; AVX512F-NEXT: vpsllq $27, %zmm0, %zmm2
3572 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3573 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3574 ; AVX512F-NEXT: vpsllq $25, %zmm0, %zmm2
3575 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3576 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3577 ; AVX512F-NEXT: vpsllq $23, %zmm0, %zmm2
3578 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3579 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3580 ; AVX512F-NEXT: vpsllq $21, %zmm0, %zmm2
3581 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3582 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3583 ; AVX512F-NEXT: vpsllq $19, %zmm0, %zmm2
3584 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3585 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3586 ; AVX512F-NEXT: vpsllq $17, %zmm0, %zmm2
3587 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3588 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3589 ; AVX512F-NEXT: vpsllq $15, %zmm0, %zmm2
3590 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3591 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3592 ; AVX512F-NEXT: vpsllq $13, %zmm0, %zmm2
3593 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3594 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3595 ; AVX512F-NEXT: vpsllq $11, %zmm0, %zmm2
3596 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3597 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3598 ; AVX512F-NEXT: vpsllq $9, %zmm0, %zmm2
3599 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3600 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3601 ; AVX512F-NEXT: vpsllq $7, %zmm0, %zmm2
3602 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3603 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3604 ; AVX512F-NEXT: vpsllq $5, %zmm0, %zmm2
3605 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3606 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3607 ; AVX512F-NEXT: vpsllq $3, %zmm0, %zmm2
3608 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3609 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3610 ; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm2
3611 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3612 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3613 ; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm2
3614 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3615 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3616 ; AVX512F-NEXT: vpsrlq $3, %zmm0, %zmm2
3617 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3618 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3619 ; AVX512F-NEXT: vpsrlq $5, %zmm0, %zmm2
3620 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3621 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3622 ; AVX512F-NEXT: vpsrlq $7, %zmm0, %zmm2
3623 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3624 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3625 ; AVX512F-NEXT: vpsrlq $9, %zmm0, %zmm2
3626 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3627 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3628 ; AVX512F-NEXT: vpsrlq $11, %zmm0, %zmm2
3629 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3630 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3631 ; AVX512F-NEXT: vpsrlq $13, %zmm0, %zmm2
3632 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3633 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3634 ; AVX512F-NEXT: vpsrlq $15, %zmm0, %zmm2
3635 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3636 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3637 ; AVX512F-NEXT: vpsrlq $17, %zmm0, %zmm2
3638 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3639 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3640 ; AVX512F-NEXT: vpsrlq $19, %zmm0, %zmm2
3641 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3642 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3643 ; AVX512F-NEXT: vpsrlq $21, %zmm0, %zmm2
3644 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3645 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3646 ; AVX512F-NEXT: vpsrlq $23, %zmm0, %zmm2
3647 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3648 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3649 ; AVX512F-NEXT: vpsrlq $25, %zmm0, %zmm2
3650 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3651 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3652 ; AVX512F-NEXT: vpsrlq $27, %zmm0, %zmm2
3653 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3654 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3655 ; AVX512F-NEXT: vpsrlq $29, %zmm0, %zmm2
3656 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3657 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3658 ; AVX512F-NEXT: vpsrlq $31, %zmm0, %zmm2
3659 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3660 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3661 ; AVX512F-NEXT: vpsrlq $33, %zmm0, %zmm2
3662 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3663 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3664 ; AVX512F-NEXT: vpsrlq $35, %zmm0, %zmm2
3665 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3666 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3667 ; AVX512F-NEXT: vpsrlq $37, %zmm0, %zmm2
3668 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3669 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3670 ; AVX512F-NEXT: vpsrlq $39, %zmm0, %zmm2
3671 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3672 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3673 ; AVX512F-NEXT: vpsrlq $41, %zmm0, %zmm2
3674 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3675 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3676 ; AVX512F-NEXT: vpsrlq $43, %zmm0, %zmm2
3677 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3678 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3679 ; AVX512F-NEXT: vpsrlq $45, %zmm0, %zmm2
3680 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3681 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3682 ; AVX512F-NEXT: vpsrlq $47, %zmm0, %zmm2
3683 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3684 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3685 ; AVX512F-NEXT: vpsrlq $49, %zmm0, %zmm2
3686 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3687 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3688 ; AVX512F-NEXT: vpsrlq $51, %zmm0, %zmm2
3689 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3690 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3691 ; AVX512F-NEXT: vpsrlq $53, %zmm0, %zmm2
3692 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3693 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3694 ; AVX512F-NEXT: vpsrlq $55, %zmm0, %zmm2
3695 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3696 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3697 ; AVX512F-NEXT: vpsrlq $57, %zmm0, %zmm2
3698 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3699 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3700 ; AVX512F-NEXT: vpsrlq $59, %zmm0, %zmm2
3701 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3702 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3703 ; AVX512F-NEXT: vpsrlq $61, %zmm0, %zmm2
3704 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3705 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
3706 ; AVX512F-NEXT: vpsrlq $63, %zmm0, %zmm0
3707 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
3708 ; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
3709 ; AVX512F-NEXT: retq
3713 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
3714 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3715 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
3716 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
3717 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
3718 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
3719 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
3720 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
3721 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
3722 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
3723 ; AVX512BW-NEXT: retq
3727 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3728 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
3729 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
3730 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
3731 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
3732 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3733 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
3734 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
3735 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
3736 ; XOPAVX1-NEXT: retq
3740 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
3741 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
3742 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
3743 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
3744 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
3745 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
3746 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
3747 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
3748 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
3749 ; XOPAVX2-NEXT: retq