Lines matching "full:next":
15 ; SSE-NEXT: paddq %xmm2, %xmm0
16 ; SSE-NEXT: paddq %xmm3, %xmm1
17 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
18 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
19 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
20 ; SSE-NEXT: retq
24 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
25 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
26 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
27 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
28 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
29 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
30 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
31 ; AVX1-NEXT: vzeroupper
32 ; AVX1-NEXT: retq
36 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
37 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
38 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
39 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
40 ; AVX2-NEXT: vzeroupper
41 ; AVX2-NEXT: retq
45 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
46 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
47 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
48 ; AVX512-NEXT: retq
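The four blocks above are FileCheck assertions (SSE, AVX1, AVX2, AVX512) for an add-then-truncate pattern on <4 x i64>. A minimal LLVM IR sketch of the kind of function they appear to check; the function name is hypothetical, not taken from the source:

define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {  ; hypothetical name
  %1 = add <4 x i64> %a0, %a1              ; 256-bit add: two paddq on SSE, one vpaddq on AVX2/AVX512
  %2 = trunc <4 x i64> %1 to <4 x i32>     ; keep the low 32 bits of each lane
  ret <4 x i32> %2
}

Lacking a dedicated narrowing instruction, SSE/AVX1/AVX2 do the truncation with shuffles (pshufd, punpcklqdq, vpermq); AVX512 can instead use the single vpmovqd seen in its block.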
57 ; SSE-NEXT: paddq %xmm6, %xmm2
58 ; SSE-NEXT: paddq %xmm4, %xmm0
59 ; SSE-NEXT: paddq %xmm7, %xmm3
60 ; SSE-NEXT: paddq %xmm5, %xmm1
61 ; SSE-NEXT: pextrw $4, %xmm1, %eax
62 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
63 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
64 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
65 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
66 ; SSE-NEXT: pextrw $4, %xmm3, %edx
67 ; SSE-NEXT: movd %edx, %xmm1
68 ; SSE-NEXT: movd %eax, %xmm3
69 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
70 ; SSE-NEXT: pextrw $4, %xmm2, %eax
71 ; SSE-NEXT: movd %eax, %xmm1
72 ; SSE-NEXT: movd %ecx, %xmm2
73 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
74 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
75 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
76 ; SSE-NEXT: retq
80 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
81 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
82 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
83 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
84 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
85 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
86 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
87 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
88 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
89 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
90 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
91 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
92 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
93 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
94 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
95 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
96 ; AVX1-NEXT: vzeroupper
97 ; AVX1-NEXT: retq
101 ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
102 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
103 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
104 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
105 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
106 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
107 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
108 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
109 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
110 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
111 ; AVX2-NEXT: vzeroupper
112 ; AVX2-NEXT: retq
116 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
117 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
118 ; AVX512-NEXT: retq
127 ; SSE-NEXT: paddd %xmm2, %xmm0
128 ; SSE-NEXT: paddd %xmm3, %xmm1
129 ; SSE-NEXT: pslld $16, %xmm1
130 ; SSE-NEXT: psrad $16, %xmm1
131 ; SSE-NEXT: pslld $16, %xmm0
132 ; SSE-NEXT: psrad $16, %xmm0
133 ; SSE-NEXT: packssdw %xmm1, %xmm0
134 ; SSE-NEXT: retq
138 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
139 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
140 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
141 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
142 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
143 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
144 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
145 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
146 ; AVX1-NEXT: vzeroupper
147 ; AVX1-NEXT: retq
151 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
152 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
153 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
154 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
155 ; AVX2-NEXT: vzeroupper
156 ; AVX2-NEXT: retq
160 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
161 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
162 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
163 ; AVX512-NEXT: retq
172 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
173 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
174 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
175 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
176 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
177 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
178 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
179 ; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
180 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
181 ; SSE-NEXT: pand %xmm8, %xmm7
182 ; SSE-NEXT: pand %xmm8, %xmm6
183 ; SSE-NEXT: packuswb %xmm7, %xmm6
184 ; SSE-NEXT: pand %xmm8, %xmm5
185 ; SSE-NEXT: pand %xmm8, %xmm4
186 ; SSE-NEXT: packuswb %xmm5, %xmm4
187 ; SSE-NEXT: packuswb %xmm6, %xmm4
188 ; SSE-NEXT: pand %xmm8, %xmm3
189 ; SSE-NEXT: pand %xmm8, %xmm2
190 ; SSE-NEXT: packuswb %xmm3, %xmm2
191 ; SSE-NEXT: pand %xmm8, %xmm1
192 ; SSE-NEXT: pand %xmm8, %xmm0
193 ; SSE-NEXT: packuswb %xmm1, %xmm0
194 ; SSE-NEXT: packuswb %xmm2, %xmm0
195 ; SSE-NEXT: packuswb %xmm4, %xmm0
196 ; SSE-NEXT: retq
200 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
201 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
202 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
203 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
204 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
205 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
206 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
207 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
208 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
209 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
210 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
211 ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
212 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
213 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
214 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
215 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
216 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
217 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
218 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
219 ; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
220 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
221 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
222 ; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
223 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
224 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
225 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
226 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
227 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
228 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
229 ; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
230 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
231 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
232 ; AVX1-NEXT: vzeroupper
233 ; AVX1-NEXT: retq
237 ; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
238 ; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
239 ; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
240 ; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
241 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
242 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
243 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
244 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
245 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
246 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
247 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
248 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
249 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
250 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
251 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
252 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
253 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
254 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
255 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
256 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
257 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
258 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
259 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
260 ; AVX2-NEXT: vzeroupper
261 ; AVX2-NEXT: retq
265 ; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
266 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
267 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
268 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
269 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
270 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
271 ; AVX512-NEXT: retq
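The wider v16i64-to-v16i8 variant above follows the same shape; only the operand count and the final narrowing differ. On SSE the second <16 x i64> operand no longer fits in registers, which is why half of the paddq operands come from the stack, and the truncation masks each quadword down to its low byte (the [255,0,...] pand) before a packuswb tree compresses the halves. A sketch under the same assumptions (hypothetical name):

define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {  ; hypothetical name
  %1 = add <16 x i64> %a0, %a1             ; 8 xmm / 4 ymm / 2 zmm adds depending on target
  %2 = trunc <16 x i64> %1 to <16 x i8>    ; low byte of each 64-bit lane
  ret <16 x i8> %2
}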
280 ; SSE-NEXT: paddd %xmm4, %xmm0
281 ; SSE-NEXT: paddd %xmm5, %xmm1
282 ; SSE-NEXT: paddd %xmm6, %xmm2
283 ; SSE-NEXT: paddd %xmm7, %xmm3
284 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
285 ; SSE-NEXT: pand %xmm4, %xmm3
286 ; SSE-NEXT: pand %xmm4, %xmm2
287 ; SSE-NEXT: packuswb %xmm3, %xmm2
288 ; SSE-NEXT: pand %xmm4, %xmm1
289 ; SSE-NEXT: pand %xmm4, %xmm0
290 ; SSE-NEXT: packuswb %xmm1, %xmm0
291 ; SSE-NEXT: packuswb %xmm2, %xmm0
292 ; SSE-NEXT: retq
296 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
297 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
298 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
299 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
300 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
301 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
302 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
303 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
304 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
305 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
306 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
307 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
308 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
309 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
310 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
311 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
312 ; AVX1-NEXT: vzeroupper
313 ; AVX1-NEXT: retq
317 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
318 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
319 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
320 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
321 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
322 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
323 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
324 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
325 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
326 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
327 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
328 ; AVX2-NEXT: vzeroupper
329 ; AVX2-NEXT: retq
333 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
334 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
335 ; AVX512-NEXT: retq
344 ; SSE-NEXT: paddw %xmm2, %xmm0
345 ; SSE-NEXT: paddw %xmm3, %xmm1
346 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
347 ; SSE-NEXT: pand %xmm2, %xmm1
348 ; SSE-NEXT: pand %xmm2, %xmm0
349 ; SSE-NEXT: packuswb %xmm1, %xmm0
350 ; SSE-NEXT: retq
354 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
355 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
356 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
357 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
358 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
359 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
360 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
361 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
362 ; AVX1-NEXT: vzeroupper
363 ; AVX1-NEXT: retq
367 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
368 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
369 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
370 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
371 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
372 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
373 ; AVX2-NEXT: vzeroupper
374 ; AVX2-NEXT: retq
378 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
379 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
380 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
381 ; AVX512F-NEXT: retq
385 ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
386 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
387 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
388 ; AVX512BW-NEXT: retq
401 ; SSE-NEXT: movl $1, %eax
402 ; SSE-NEXT: movd %rax, %xmm2
403 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
404 ; SSE-NEXT: paddq %xmm0, %xmm2
405 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
406 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
407 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
408 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
409 ; SSE-NEXT: retq
413 ; AVX1-NEXT: movl $1, %eax
414 ; AVX1-NEXT: vmovq %rax, %xmm1
415 ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
416 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
417 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
418 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
419 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
420 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
421 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
422 ; AVX1-NEXT: vzeroupper
423 ; AVX1-NEXT: retq
427 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
428 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
429 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
430 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
431 ; AVX2-NEXT: vzeroupper
432 ; AVX2-NEXT: retq
436 ; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
437 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
438 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
439 ; AVX512-NEXT: retq
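The _const variants add a constant vector instead of a second argument. Judging from the asm, the constant here is <i64 0, i64 1, i64 2, i64 3>: the movl $1 / movd / pslldq sequence materializes the <0,1> half in a register (the 1 shifted into the upper quadword) and the other half is a rip-relative constant load. A hedged sketch; the name and exact constant are inferred, not confirmed by the source:

define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {  ; hypothetical name
  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>  ; constant inferred from the asm
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}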
448 ; SSE-NEXT: movdqa %xmm0, %xmm4
449 ; SSE-NEXT: movl $1, %eax
450 ; SSE-NEXT: movd %rax, %xmm0
451 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
452 ; SSE-NEXT: paddq %xmm4, %xmm0
453 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
454 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
455 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
456 ; SSE-NEXT: pextrw $4, %xmm1, %eax
457 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
458 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
459 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
460 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
461 ; SSE-NEXT: pextrw $4, %xmm3, %edx
462 ; SSE-NEXT: movd %edx, %xmm1
463 ; SSE-NEXT: movd %eax, %xmm3
464 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
465 ; SSE-NEXT: movd %ecx, %xmm1
466 ; SSE-NEXT: pextrw $4, %xmm2, %eax
467 ; SSE-NEXT: movd %eax, %xmm2
468 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
469 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
470 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
471 ; SSE-NEXT: retq
475 ; AVX1-NEXT: movl $1, %eax
476 ; AVX1-NEXT: vmovq %rax, %xmm2
477 ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
478 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
479 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
480 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
481 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3
482 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
483 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
484 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
485 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
486 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
487 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
488 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
489 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
490 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
491 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
492 ; AVX1-NEXT: vzeroupper
493 ; AVX1-NEXT: retq
497 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
498 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
499 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
500 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
501 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
502 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
503 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
504 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
505 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
506 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
507 ; AVX2-NEXT: vzeroupper
508 ; AVX2-NEXT: retq
512 ; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
513 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
514 ; AVX512-NEXT: retq
523 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
524 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
525 ; SSE-NEXT: pslld $16, %xmm1
526 ; SSE-NEXT: psrad $16, %xmm1
527 ; SSE-NEXT: pslld $16, %xmm0
528 ; SSE-NEXT: psrad $16, %xmm0
529 ; SSE-NEXT: packssdw %xmm1, %xmm0
530 ; SSE-NEXT: retq
534 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1
535 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
536 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
537 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
538 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
539 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
540 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
541 ; AVX1-NEXT: vzeroupper
542 ; AVX1-NEXT: retq
546 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
547 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
548 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
549 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
550 ; AVX2-NEXT: vzeroupper
551 ; AVX2-NEXT: retq
555 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
556 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
557 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
558 ; AVX512-NEXT: retq
567 ; SSE-NEXT: movl $1, %eax
568 ; SSE-NEXT: movd %rax, %xmm8
569 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
570 ; SSE-NEXT: paddq %xmm8, %xmm0
571 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
572 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
573 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
574 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm4
575 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm5
576 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm6
577 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm7
578 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
579 ; SSE-NEXT: pand %xmm8, %xmm7
580 ; SSE-NEXT: pand %xmm8, %xmm6
581 ; SSE-NEXT: packuswb %xmm7, %xmm6
582 ; SSE-NEXT: pand %xmm8, %xmm5
583 ; SSE-NEXT: pand %xmm8, %xmm4
584 ; SSE-NEXT: packuswb %xmm5, %xmm4
585 ; SSE-NEXT: packuswb %xmm6, %xmm4
586 ; SSE-NEXT: pand %xmm8, %xmm3
587 ; SSE-NEXT: pand %xmm8, %xmm2
588 ; SSE-NEXT: packuswb %xmm3, %xmm2
589 ; SSE-NEXT: pand %xmm8, %xmm1
590 ; SSE-NEXT: pand %xmm8, %xmm0
591 ; SSE-NEXT: packuswb %xmm1, %xmm0
592 ; SSE-NEXT: packuswb %xmm2, %xmm0
593 ; SSE-NEXT: packuswb %xmm4, %xmm0
594 ; SSE-NEXT: retq
598 ; AVX1-NEXT: movl $1, %eax
599 ; AVX1-NEXT: vmovq %rax, %xmm4
600 ; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
601 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
602 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
603 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
604 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5
605 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
606 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
607 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6
608 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
609 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2
610 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7
611 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
612 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3
613 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
614 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
615 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
616 ; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
617 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
618 ; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
619 ; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
620 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
621 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
622 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
623 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
624 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
625 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
626 ; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
627 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
628 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
629 ; AVX1-NEXT: vzeroupper
630 ; AVX1-NEXT: retq
634 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
635 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
636 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3
637 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2
638 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
639 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
640 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
641 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
642 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
643 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
644 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
645 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
646 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
647 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
648 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
649 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
650 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
651 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
652 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
653 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
654 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
655 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
656 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
657 ; AVX2-NEXT: vzeroupper
658 ; AVX2-NEXT: retq
662 ; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
663 ; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
664 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
665 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
666 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
667 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
668 ; AVX512-NEXT: retq
677 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
678 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
679 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm2
680 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm3
681 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
682 ; SSE-NEXT: pand %xmm4, %xmm3
683 ; SSE-NEXT: pand %xmm4, %xmm2
684 ; SSE-NEXT: packuswb %xmm3, %xmm2
685 ; SSE-NEXT: pand %xmm4, %xmm1
686 ; SSE-NEXT: pand %xmm4, %xmm0
687 ; SSE-NEXT: packuswb %xmm1, %xmm0
688 ; SSE-NEXT: packuswb %xmm2, %xmm0
689 ; SSE-NEXT: retq
693 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2
694 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
695 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
696 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3
697 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
698 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
699 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
700 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
701 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
702 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
703 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
704 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
705 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
706 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
707 ; AVX1-NEXT: vzeroupper
708 ; AVX1-NEXT: retq
712 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
713 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1
714 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
715 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
716 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
717 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
718 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
719 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
720 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
721 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
722 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
723 ; AVX2-NEXT: vzeroupper
724 ; AVX2-NEXT: retq
728 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
729 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
730 ; AVX512-NEXT: retq
739 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
740 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
741 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
742 ; SSE-NEXT: pand %xmm2, %xmm1
743 ; SSE-NEXT: pand %xmm2, %xmm0
744 ; SSE-NEXT: packuswb %xmm1, %xmm0
745 ; SSE-NEXT: retq
749 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1
750 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
751 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
752 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
753 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
754 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
755 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
756 ; AVX1-NEXT: vzeroupper
757 ; AVX1-NEXT: retq
761 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
762 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
763 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
764 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
765 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
766 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
767 ; AVX2-NEXT: vzeroupper
768 ; AVX2-NEXT: retq
772 ; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
773 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
774 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
775 ; AVX512F-NEXT: retq
779 ; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
780 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
781 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
782 ; AVX512BW-NEXT: retq
795 ; SSE-NEXT: psubq %xmm2, %xmm0
796 ; SSE-NEXT: psubq %xmm3, %xmm1
797 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
798 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
799 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
800 ; SSE-NEXT: retq
804 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
805 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
806 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
807 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
808 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
809 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
810 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
811 ; AVX1-NEXT: vzeroupper
812 ; AVX1-NEXT: retq
816 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
817 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
818 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
819 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
820 ; AVX2-NEXT: vzeroupper
821 ; AVX2-NEXT: retq
825 ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
826 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
827 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
828 ; AVX512-NEXT: retq
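The sub family mirrors the add family: psubq/vpsubq replaces paddq/vpaddq, and every truncation shuffle is unchanged, since trunc only consumes the low half of each lane regardless of the arithmetic op. Sketch (hypothetical name):

define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {  ; hypothetical name
  %1 = sub <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}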
837 ; SSE-NEXT: psubq %xmm6, %xmm2
838 ; SSE-NEXT: psubq %xmm4, %xmm0
839 ; SSE-NEXT: psubq %xmm7, %xmm3
840 ; SSE-NEXT: psubq %xmm5, %xmm1
841 ; SSE-NEXT: pextrw $4, %xmm1, %eax
842 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
843 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
844 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
845 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
846 ; SSE-NEXT: pextrw $4, %xmm3, %edx
847 ; SSE-NEXT: movd %edx, %xmm1
848 ; SSE-NEXT: movd %eax, %xmm3
849 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
850 ; SSE-NEXT: pextrw $4, %xmm2, %eax
851 ; SSE-NEXT: movd %eax, %xmm1
852 ; SSE-NEXT: movd %ecx, %xmm2
853 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
854 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
855 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
856 ; SSE-NEXT: retq
860 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
861 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
862 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
863 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
864 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
865 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
866 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
867 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
868 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
869 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
870 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
871 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
872 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
873 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
874 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
875 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
876 ; AVX1-NEXT: vzeroupper
877 ; AVX1-NEXT: retq
881 ; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
882 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
883 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
884 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
885 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
886 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
887 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
888 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
889 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
890 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
891 ; AVX2-NEXT: vzeroupper
892 ; AVX2-NEXT: retq
896 ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
897 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
898 ; AVX512-NEXT: retq
907 ; SSE-NEXT: psubd %xmm2, %xmm0
908 ; SSE-NEXT: psubd %xmm3, %xmm1
909 ; SSE-NEXT: pslld $16, %xmm1
910 ; SSE-NEXT: psrad $16, %xmm1
911 ; SSE-NEXT: pslld $16, %xmm0
912 ; SSE-NEXT: psrad $16, %xmm0
913 ; SSE-NEXT: packssdw %xmm1, %xmm0
914 ; SSE-NEXT: retq
918 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
919 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
920 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
921 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
922 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
923 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
924 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
925 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
926 ; AVX1-NEXT: vzeroupper
927 ; AVX1-NEXT: retq
931 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
932 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
933 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
934 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
935 ; AVX2-NEXT: vzeroupper
936 ; AVX2-NEXT: retq
940 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
941 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
942 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
943 ; AVX512-NEXT: retq
952 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
953 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
954 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
955 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
956 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
957 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
958 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
959 ; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
960 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
961 ; SSE-NEXT: pand %xmm8, %xmm7
962 ; SSE-NEXT: pand %xmm8, %xmm6
963 ; SSE-NEXT: packuswb %xmm7, %xmm6
964 ; SSE-NEXT: pand %xmm8, %xmm5
965 ; SSE-NEXT: pand %xmm8, %xmm4
966 ; SSE-NEXT: packuswb %xmm5, %xmm4
967 ; SSE-NEXT: packuswb %xmm6, %xmm4
968 ; SSE-NEXT: pand %xmm8, %xmm3
969 ; SSE-NEXT: pand %xmm8, %xmm2
970 ; SSE-NEXT: packuswb %xmm3, %xmm2
971 ; SSE-NEXT: pand %xmm8, %xmm1
972 ; SSE-NEXT: pand %xmm8, %xmm0
973 ; SSE-NEXT: packuswb %xmm1, %xmm0
974 ; SSE-NEXT: packuswb %xmm2, %xmm0
975 ; SSE-NEXT: packuswb %xmm4, %xmm0
976 ; SSE-NEXT: retq
980 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
981 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
982 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
983 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
984 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
985 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
986 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
987 ; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
988 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
989 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
990 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
991 ; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
992 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
993 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
994 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
995 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
996 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
997 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
998 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
999 ; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
1000 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1001 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
1002 ; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
1003 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
1004 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1005 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
1006 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
1007 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1008 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
1009 ; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
1010 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1011 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1012 ; AVX1-NEXT: vzeroupper
1013 ; AVX1-NEXT: retq
1017 ; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
1018 ; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
1019 ; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
1020 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
1021 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
1022 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
1023 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
1024 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
1025 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1026 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1027 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1028 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1029 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1030 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
1031 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1032 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1033 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
1034 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
1035 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1036 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1037 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1038 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
1039 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1040 ; AVX2-NEXT: vzeroupper
1041 ; AVX2-NEXT: retq
1045 ; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
1046 ; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
1047 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1048 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
1049 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1050 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1051 ; AVX512-NEXT: retq
1060 ; SSE-NEXT: psubd %xmm4, %xmm0
1061 ; SSE-NEXT: psubd %xmm5, %xmm1
1062 ; SSE-NEXT: psubd %xmm6, %xmm2
1063 ; SSE-NEXT: psubd %xmm7, %xmm3
1064 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1065 ; SSE-NEXT: pand %xmm4, %xmm3
1066 ; SSE-NEXT: pand %xmm4, %xmm2
1067 ; SSE-NEXT: packuswb %xmm3, %xmm2
1068 ; SSE-NEXT: pand %xmm4, %xmm1
1069 ; SSE-NEXT: pand %xmm4, %xmm0
1070 ; SSE-NEXT: packuswb %xmm1, %xmm0
1071 ; SSE-NEXT: packuswb %xmm2, %xmm0
1072 ; SSE-NEXT: retq
1076 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
1077 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1078 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1079 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
1080 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
1081 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1082 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1083 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
1084 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1085 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1086 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1087 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
1088 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1089 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
1090 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
1091 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1092 ; AVX1-NEXT: vzeroupper
1093 ; AVX1-NEXT: retq
1097 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
1098 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
1099 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1100 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1101 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1102 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1103 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1104 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1105 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1106 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1107 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1108 ; AVX2-NEXT: vzeroupper
1109 ; AVX2-NEXT: retq
1113 ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
1114 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1115 ; AVX512-NEXT: retq
1124 ; SSE-NEXT: psubw %xmm2, %xmm0
1125 ; SSE-NEXT: psubw %xmm3, %xmm1
1126 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1127 ; SSE-NEXT: pand %xmm2, %xmm1
1128 ; SSE-NEXT: pand %xmm2, %xmm0
1129 ; SSE-NEXT: packuswb %xmm1, %xmm0
1130 ; SSE-NEXT: retq
1134 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
1135 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1136 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1137 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
1138 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1139 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1140 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1141 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1142 ; AVX1-NEXT: vzeroupper
1143 ; AVX1-NEXT: retq
1147 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1148 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1149 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1150 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1151 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1152 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1153 ; AVX2-NEXT: vzeroupper
1154 ; AVX2-NEXT: retq
1158 ; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1159 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
1160 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1161 ; AVX512F-NEXT: retq
1165 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
1166 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1167 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1168 ; AVX512BW-NEXT: retq
1181 ; SSE-NEXT: movl $1, %eax
1182 ; SSE-NEXT: movd %rax, %xmm2
1183 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
1184 ; SSE-NEXT: psubq %xmm2, %xmm0
1185 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
1186 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1187 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1188 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1189 ; SSE-NEXT: retq
1193 ; AVX1-NEXT: movl $1, %eax
1194 ; AVX1-NEXT: vmovq %rax, %xmm1
1195 ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1196 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
1197 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1198 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
1199 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1200 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
1201 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1202 ; AVX1-NEXT: vzeroupper
1203 ; AVX1-NEXT: retq
1207 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
1208 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1209 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1210 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1211 ; AVX2-NEXT: vzeroupper
1212 ; AVX2-NEXT: retq
1216 ; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
1217 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1218 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1219 ; AVX512-NEXT: retq
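Likewise for sub with a constant operand: the same movl $1 / movd / pslldq materialization shows up, so the constant again looks like <i64 0, i64 1, i64 2, i64 3>. Sketch (name and constant inferred):

define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {  ; hypothetical name
  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>  ; constant inferred from the asm
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}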
1228 ; SSE-NEXT: movl $1, %eax
1229 ; SSE-NEXT: movd %rax, %xmm4
1230 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
1231 ; SSE-NEXT: psubq %xmm4, %xmm0
1232 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
1233 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
1234 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
1235 ; SSE-NEXT: pextrw $4, %xmm1, %eax
1236 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1237 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
1238 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1239 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1240 ; SSE-NEXT: pextrw $4, %xmm3, %edx
1241 ; SSE-NEXT: movd %edx, %xmm1
1242 ; SSE-NEXT: movd %eax, %xmm3
1243 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1244 ; SSE-NEXT: movd %ecx, %xmm1
1245 ; SSE-NEXT: pextrw $4, %xmm2, %eax
1246 ; SSE-NEXT: movd %eax, %xmm2
1247 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1248 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1249 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1250 ; SSE-NEXT: retq
1254 ; AVX1-NEXT: movl $1, %eax
1255 ; AVX1-NEXT: vmovq %rax, %xmm2
1256 ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
1257 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
1258 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1259 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
1260 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm3
1261 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1262 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
1263 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
1264 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
1265 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
1266 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
1267 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
1268 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
1269 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1270 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1271 ; AVX1-NEXT: vzeroupper
1272 ; AVX1-NEXT: retq
1276 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
1277 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
1278 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1279 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1280 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
1281 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
1282 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1283 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1284 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1285 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1286 ; AVX2-NEXT: vzeroupper
1287 ; AVX2-NEXT: retq
1291 ; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
1292 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1293 ; AVX512-NEXT: retq
1302 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
1303 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
1304 ; SSE-NEXT: pslld $16, %xmm1
1305 ; SSE-NEXT: psrad $16, %xmm1
1306 ; SSE-NEXT: pslld $16, %xmm0
1307 ; SSE-NEXT: psrad $16, %xmm0
1308 ; SSE-NEXT: packssdw %xmm1, %xmm0
1309 ; SSE-NEXT: retq
1313 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1
1314 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1315 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1316 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1317 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1318 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1319 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1320 ; AVX1-NEXT: vzeroupper
1321 ; AVX1-NEXT: retq
1325 ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
1326 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1327 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1328 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1329 ; AVX2-NEXT: vzeroupper
1330 ; AVX2-NEXT: retq
1334 ; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
1335 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1336 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1337 ; AVX512-NEXT: retq
1346 ; SSE-NEXT: movl $1, %eax
1347 ; SSE-NEXT: movd %rax, %xmm8
1348 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
1349 ; SSE-NEXT: psubq %xmm8, %xmm0
1350 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
1351 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
1352 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
1353 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm4
1354 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm5
1355 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm6
1356 ; SSE-NEXT: psubq {{.*}}(%rip), %xmm7
1357 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1358 ; SSE-NEXT: pand %xmm8, %xmm7
1359 ; SSE-NEXT: pand %xmm8, %xmm6
1360 ; SSE-NEXT: packuswb %xmm7, %xmm6
1361 ; SSE-NEXT: pand %xmm8, %xmm5
1362 ; SSE-NEXT: pand %xmm8, %xmm4
1363 ; SSE-NEXT: packuswb %xmm5, %xmm4
1364 ; SSE-NEXT: packuswb %xmm6, %xmm4
1365 ; SSE-NEXT: pand %xmm8, %xmm3
1366 ; SSE-NEXT: pand %xmm8, %xmm2
1367 ; SSE-NEXT: packuswb %xmm3, %xmm2
1368 ; SSE-NEXT: pand %xmm8, %xmm1
1369 ; SSE-NEXT: pand %xmm8, %xmm0
1370 ; SSE-NEXT: packuswb %xmm1, %xmm0
1371 ; SSE-NEXT: packuswb %xmm2, %xmm0
1372 ; SSE-NEXT: packuswb %xmm4, %xmm0
1373 ; SSE-NEXT: retq
1377 ; AVX1-NEXT: movl $1, %eax
1378 ; AVX1-NEXT: vmovq %rax, %xmm4
1379 ; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
1380 ; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
1381 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1382 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
1383 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm5
1384 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1385 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
1386 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm6
1387 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1388 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2
1389 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7
1390 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1391 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3
1392 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1393 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
1394 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
1395 ; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
1396 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
1397 ; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
1398 ; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
1399 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
1400 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
1401 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
1402 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
1403 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
1404 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
1405 ; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
1406 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1407 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1408 ; AVX1-NEXT: vzeroupper
1409 ; AVX1-NEXT: retq
1413 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
1414 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
1415 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3
1416 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2
1417 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
1418 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
1419 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
1420 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
1421 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1422 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1423 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1424 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1425 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1426 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
1427 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1428 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1429 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
1430 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
1431 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1432 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1433 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1434 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
1435 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1436 ; AVX2-NEXT: vzeroupper
1437 ; AVX2-NEXT: retq
1441 ; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
1442 ; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
1443 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1444 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
1445 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1446 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1447 ; AVX512-NEXT: retq
1456 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
1457 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
1458 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm2
1459 ; SSE-NEXT: psubd {{.*}}(%rip), %xmm3
1460 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1461 ; SSE-NEXT: pand %xmm4, %xmm3
1462 ; SSE-NEXT: pand %xmm4, %xmm2
1463 ; SSE-NEXT: packuswb %xmm3, %xmm2
1464 ; SSE-NEXT: pand %xmm4, %xmm1
1465 ; SSE-NEXT: pand %xmm4, %xmm0
1466 ; SSE-NEXT: packuswb %xmm1, %xmm0
1467 ; SSE-NEXT: packuswb %xmm2, %xmm0
1468 ; SSE-NEXT: retq
1472 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2
1473 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1474 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
1475 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3
1476 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1477 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1
1478 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1479 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
1480 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
1481 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
1482 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
1483 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
1484 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
1485 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1486 ; AVX1-NEXT: vzeroupper
1487 ; AVX1-NEXT: retq
1491 ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
1492 ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1
1493 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1494 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1495 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1496 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1497 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1498 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1499 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1500 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1501 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1502 ; AVX2-NEXT: vzeroupper
1503 ; AVX2-NEXT: retq
1507 ; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0
1508 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
1509 ; AVX512-NEXT: retq
1518 ; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
1519 ; SSE-NEXT: psubw {{.*}}(%rip), %xmm1
1520 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1521 ; SSE-NEXT: pand %xmm2, %xmm1
1522 ; SSE-NEXT: pand %xmm2, %xmm0
1523 ; SSE-NEXT: packuswb %xmm1, %xmm0
1524 ; SSE-NEXT: retq
1528 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1
1529 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1530 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
1531 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1532 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1533 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1534 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1535 ; AVX1-NEXT: vzeroupper
1536 ; AVX1-NEXT: retq
1540 ; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
1541 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1542 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1543 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1544 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1545 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1546 ; AVX2-NEXT: vzeroupper
1547 ; AVX2-NEXT: retq
1551 ; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
1552 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
1553 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1554 ; AVX512F-NEXT: retq
1558 ; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
1559 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1560 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1561 ; AVX512BW-NEXT: retq
1574 ; SSE-NEXT: movdqa %xmm0, %xmm4
1575 ; SSE-NEXT: pmuludq %xmm2, %xmm4
1576 ; SSE-NEXT: movdqa %xmm2, %xmm5
1577 ; SSE-NEXT: psrlq $32, %xmm5
1578 ; SSE-NEXT: pmuludq %xmm0, %xmm5
1579 ; SSE-NEXT: psllq $32, %xmm5
1580 ; SSE-NEXT: paddq %xmm4, %xmm5
1581 ; SSE-NEXT: psrlq $32, %xmm0
1582 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1583 ; SSE-NEXT: psllq $32, %xmm0
1584 ; SSE-NEXT: paddq %xmm5, %xmm0
1585 ; SSE-NEXT: movdqa %xmm1, %xmm2
1586 ; SSE-NEXT: pmuludq %xmm3, %xmm2
1587 ; SSE-NEXT: movdqa %xmm3, %xmm4
1588 ; SSE-NEXT: psrlq $32, %xmm4
1589 ; SSE-NEXT: pmuludq %xmm1, %xmm4
1590 ; SSE-NEXT: psllq $32, %xmm4
1591 ; SSE-NEXT: paddq %xmm2, %xmm4
1592 ; SSE-NEXT: psrlq $32, %xmm1
1593 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1594 ; SSE-NEXT: psllq $32, %xmm1
1595 ; SSE-NEXT: paddq %xmm4, %xmm1
1596 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1597 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1598 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1599 ; SSE-NEXT: retq
1603 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
1604 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
1605 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
1606 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
1607 ; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
1608 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
1609 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
1610 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
1611 ; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
1612 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1613 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1614 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
1615 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
1616 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
1617 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
1618 ; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
1619 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
1620 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
1621 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
1622 ; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
1623 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
1624 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1625 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1626 ; AVX1-NEXT: vzeroupper
1627 ; AVX1-NEXT: retq
1631 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
1632 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
1633 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
1634 ; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
1635 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
1636 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
1637 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
1638 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
1639 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
1640 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1641 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1642 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1643 ; AVX2-NEXT: vzeroupper
1644 ; AVX2-NEXT: retq
1648 ; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
1649 ; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
1650 ; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
1651 ; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
1652 ; AVX512-NEXT: vpaddq %ymm3, %ymm2, %ymm2
1653 ; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
1654 ; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
1655 ; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
1656 ; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
1657 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
1658 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1659 ; AVX512-NEXT: retq
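; Note: the "# kill" lines are register-liveness annotations from the machine
; instruction printer (the ymm result is consumed as %xmm0), not emitted
; instructions; vpmovqd is the AVX512 narrowing move that truncates each i64
; lane to i32 in one step.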
1668 ; SSE-NEXT: movdqa %xmm2, %xmm8
1669 ; SSE-NEXT: pmuludq %xmm6, %xmm8
1670 ; SSE-NEXT: movdqa %xmm6, %xmm9
1671 ; SSE-NEXT: psrlq $32, %xmm9
1672 ; SSE-NEXT: pmuludq %xmm2, %xmm9
1673 ; SSE-NEXT: psllq $32, %xmm9
1674 ; SSE-NEXT: paddq %xmm8, %xmm9
1675 ; SSE-NEXT: psrlq $32, %xmm2
1676 ; SSE-NEXT: pmuludq %xmm6, %xmm2
1677 ; SSE-NEXT: psllq $32, %xmm2
1678 ; SSE-NEXT: paddq %xmm9, %xmm2
1679 ; SSE-NEXT: movdqa %xmm0, %xmm8
1680 ; SSE-NEXT: pmuludq %xmm4, %xmm8
1681 ; SSE-NEXT: movdqa %xmm4, %xmm6
1682 ; SSE-NEXT: psrlq $32, %xmm6
1683 ; SSE-NEXT: pmuludq %xmm0, %xmm6
1684 ; SSE-NEXT: psllq $32, %xmm6
1685 ; SSE-NEXT: paddq %xmm8, %xmm6
1686 ; SSE-NEXT: psrlq $32, %xmm0
1687 ; SSE-NEXT: pmuludq %xmm4, %xmm0
1688 ; SSE-NEXT: psllq $32, %xmm0
1689 ; SSE-NEXT: paddq %xmm6, %xmm0
1690 ; SSE-NEXT: movdqa %xmm3, %xmm4
1691 ; SSE-NEXT: pmuludq %xmm7, %xmm4
1692 ; SSE-NEXT: movdqa %xmm7, %xmm6
1693 ; SSE-NEXT: psrlq $32, %xmm6
1694 ; SSE-NEXT: pmuludq %xmm3, %xmm6
1695 ; SSE-NEXT: psllq $32, %xmm6
1696 ; SSE-NEXT: paddq %xmm4, %xmm6
1697 ; SSE-NEXT: psrlq $32, %xmm3
1698 ; SSE-NEXT: pmuludq %xmm7, %xmm3
1699 ; SSE-NEXT: psllq $32, %xmm3
1700 ; SSE-NEXT: paddq %xmm6, %xmm3
1701 ; SSE-NEXT: movdqa %xmm1, %xmm4
1702 ; SSE-NEXT: pmuludq %xmm5, %xmm4
1703 ; SSE-NEXT: movdqa %xmm5, %xmm6
1704 ; SSE-NEXT: psrlq $32, %xmm6
1705 ; SSE-NEXT: pmuludq %xmm1, %xmm6
1706 ; SSE-NEXT: psllq $32, %xmm6
1707 ; SSE-NEXT: paddq %xmm4, %xmm6
1708 ; SSE-NEXT: psrlq $32, %xmm1
1709 ; SSE-NEXT: pmuludq %xmm5, %xmm1
1710 ; SSE-NEXT: psllq $32, %xmm1
1711 ; SSE-NEXT: paddq %xmm6, %xmm1
1712 ; SSE-NEXT: pextrw $4, %xmm1, %eax
1713 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1714 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
1715 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1716 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1717 ; SSE-NEXT: pextrw $4, %xmm3, %edx
1718 ; SSE-NEXT: movd %edx, %xmm1
1719 ; SSE-NEXT: movd %eax, %xmm3
1720 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1721 ; SSE-NEXT: pextrw $4, %xmm2, %eax
1722 ; SSE-NEXT: movd %eax, %xmm1
1723 ; SSE-NEXT: movd %ecx, %xmm2
1724 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1725 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1726 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1727 ; SSE-NEXT: retq
1731 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm4
1732 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
1733 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
1734 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
1735 ; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
1736 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5
1737 ; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
1738 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
1739 ; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
1740 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1741 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1742 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5
1743 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
1744 ; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
1745 ; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
1746 ; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
1747 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
1748 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
1749 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
1750 ; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
1751 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm2
1752 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
1753 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
1754 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
1755 ; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
1756 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
1757 ; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5
1758 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
1759 ; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
1760 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1761 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1762 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
1763 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
1764 ; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6
1765 ; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
1766 ; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
1767 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
1768 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1769 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
1770 ; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
1771 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1772 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
1773 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
1774 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1775 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
1776 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
1777 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1778 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1779 ; AVX1-NEXT: vzeroupper
1780 ; AVX1-NEXT: retq
1784 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm4
1785 ; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
1786 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
1787 ; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
1788 ; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4
1789 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
1790 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
1791 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
1792 ; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
1793 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3
1794 ; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
1795 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
1796 ; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
1797 ; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3
1798 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
1799 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
1800 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
1801 ; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
1802 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1803 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1804 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
1805 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
1806 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1807 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1808 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1809 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1810 ; AVX2-NEXT: vzeroupper
1811 ; AVX2-NEXT: retq
1815 ; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
1816 ; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3
1817 ; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
1818 ; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
1819 ; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm2
1820 ; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
1821 ; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
1822 ; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
1823 ; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
1824 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
1825 ; AVX512-NEXT: retq
1834 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1835 ; SSE-NEXT: pmuludq %xmm2, %xmm0
1836 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1837 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1838 ; SSE-NEXT: pmuludq %xmm4, %xmm2
1839 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1840 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1841 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1842 ; SSE-NEXT: pmuludq %xmm3, %xmm1
1843 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1844 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1845 ; SSE-NEXT: pmuludq %xmm2, %xmm3
1846 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1847 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1848 ; SSE-NEXT: pslld $16, %xmm1
1849 ; SSE-NEXT: psrad $16, %xmm1
1850 ; SSE-NEXT: pslld $16, %xmm0
1851 ; SSE-NEXT: psrad $16, %xmm0
1852 ; SSE-NEXT: packssdw %xmm1, %xmm0
1853 ; SSE-NEXT: retq
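; Note: the pslld $16/psrad $16 pairs sign-fill each i32 lane from bit 15 so
; that the signed-saturating packssdw acts as a plain i32 -> i16 truncation.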
1857 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
1858 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1859 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1860 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1861 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1862 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1863 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1864 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1865 ; AVX1-NEXT: vzeroupper
1866 ; AVX1-NEXT: retq
1870 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1871 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1872 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1873 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1874 ; AVX2-NEXT: vzeroupper
1875 ; AVX2-NEXT: retq
1879 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1880 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
1881 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1882 ; AVX512-NEXT: retq
1891 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1892 ; SSE-NEXT: movdqa %xmm0, %xmm9
1893 ; SSE-NEXT: pmuludq %xmm8, %xmm9
1894 ; SSE-NEXT: movdqa %xmm8, %xmm10
1895 ; SSE-NEXT: psrlq $32, %xmm10
1896 ; SSE-NEXT: pmuludq %xmm0, %xmm10
1897 ; SSE-NEXT: psllq $32, %xmm10
1898 ; SSE-NEXT: paddq %xmm10, %xmm9
1899 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
1900 ; SSE-NEXT: psrlq $32, %xmm0
1901 ; SSE-NEXT: pmuludq %xmm8, %xmm0
1902 ; SSE-NEXT: psllq $32, %xmm0
1903 ; SSE-NEXT: paddq %xmm9, %xmm0
1904 ; SSE-NEXT: movdqa %xmm1, %xmm8
1905 ; SSE-NEXT: pmuludq %xmm10, %xmm8
1906 ; SSE-NEXT: movdqa %xmm10, %xmm9
1907 ; SSE-NEXT: psrlq $32, %xmm9
1908 ; SSE-NEXT: pmuludq %xmm1, %xmm9
1909 ; SSE-NEXT: psllq $32, %xmm9
1910 ; SSE-NEXT: paddq %xmm8, %xmm9
1911 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1912 ; SSE-NEXT: psrlq $32, %xmm1
1913 ; SSE-NEXT: pmuludq %xmm10, %xmm1
1914 ; SSE-NEXT: psllq $32, %xmm1
1915 ; SSE-NEXT: paddq %xmm9, %xmm1
1916 ; SSE-NEXT: movdqa %xmm2, %xmm9
1917 ; SSE-NEXT: pmuludq %xmm8, %xmm9
1918 ; SSE-NEXT: movdqa %xmm8, %xmm10
1919 ; SSE-NEXT: psrlq $32, %xmm10
1920 ; SSE-NEXT: pmuludq %xmm2, %xmm10
1921 ; SSE-NEXT: psllq $32, %xmm10
1922 ; SSE-NEXT: paddq %xmm9, %xmm10
1923 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
1924 ; SSE-NEXT: psrlq $32, %xmm2
1925 ; SSE-NEXT: pmuludq %xmm8, %xmm2
1926 ; SSE-NEXT: psllq $32, %xmm2
1927 ; SSE-NEXT: paddq %xmm10, %xmm2
1928 ; SSE-NEXT: movdqa %xmm3, %xmm8
1929 ; SSE-NEXT: pmuludq %xmm9, %xmm8
1930 ; SSE-NEXT: movdqa %xmm9, %xmm10
1931 ; SSE-NEXT: psrlq $32, %xmm10
1932 ; SSE-NEXT: pmuludq %xmm3, %xmm10
1933 ; SSE-NEXT: psllq $32, %xmm10
1934 ; SSE-NEXT: paddq %xmm8, %xmm10
1935 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1936 ; SSE-NEXT: psrlq $32, %xmm3
1937 ; SSE-NEXT: pmuludq %xmm9, %xmm3
1938 ; SSE-NEXT: psllq $32, %xmm3
1939 ; SSE-NEXT: paddq %xmm10, %xmm3
1940 ; SSE-NEXT: movdqa %xmm4, %xmm9
1941 ; SSE-NEXT: pmuludq %xmm8, %xmm9
1942 ; SSE-NEXT: movdqa %xmm8, %xmm10
1943 ; SSE-NEXT: psrlq $32, %xmm10
1944 ; SSE-NEXT: pmuludq %xmm4, %xmm10
1945 ; SSE-NEXT: psllq $32, %xmm10
1946 ; SSE-NEXT: paddq %xmm9, %xmm10
1947 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
1948 ; SSE-NEXT: psrlq $32, %xmm4
1949 ; SSE-NEXT: pmuludq %xmm8, %xmm4
1950 ; SSE-NEXT: psllq $32, %xmm4
1951 ; SSE-NEXT: paddq %xmm10, %xmm4
1952 ; SSE-NEXT: movdqa %xmm5, %xmm8
1953 ; SSE-NEXT: pmuludq %xmm9, %xmm8
1954 ; SSE-NEXT: movdqa %xmm9, %xmm10
1955 ; SSE-NEXT: psrlq $32, %xmm10
1956 ; SSE-NEXT: pmuludq %xmm5, %xmm10
1957 ; SSE-NEXT: psllq $32, %xmm10
1958 ; SSE-NEXT: paddq %xmm8, %xmm10
1959 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1960 ; SSE-NEXT: psrlq $32, %xmm5
1961 ; SSE-NEXT: pmuludq %xmm9, %xmm5
1962 ; SSE-NEXT: psllq $32, %xmm5
1963 ; SSE-NEXT: paddq %xmm10, %xmm5
1964 ; SSE-NEXT: movdqa %xmm6, %xmm9
1965 ; SSE-NEXT: pmuludq %xmm8, %xmm9
1966 ; SSE-NEXT: movdqa %xmm8, %xmm10
1967 ; SSE-NEXT: psrlq $32, %xmm10
1968 ; SSE-NEXT: pmuludq %xmm6, %xmm10
1969 ; SSE-NEXT: psllq $32, %xmm10
1970 ; SSE-NEXT: paddq %xmm9, %xmm10
1971 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
1972 ; SSE-NEXT: psrlq $32, %xmm6
1973 ; SSE-NEXT: pmuludq %xmm8, %xmm6
1974 ; SSE-NEXT: psllq $32, %xmm6
1975 ; SSE-NEXT: paddq %xmm10, %xmm6
1976 ; SSE-NEXT: movdqa %xmm7, %xmm8
1977 ; SSE-NEXT: pmuludq %xmm9, %xmm8
1978 ; SSE-NEXT: movdqa %xmm9, %xmm10
1979 ; SSE-NEXT: psrlq $32, %xmm10
1980 ; SSE-NEXT: pmuludq %xmm7, %xmm10
1981 ; SSE-NEXT: psllq $32, %xmm10
1982 ; SSE-NEXT: paddq %xmm8, %xmm10
1983 ; SSE-NEXT: psrlq $32, %xmm7
1984 ; SSE-NEXT: pmuludq %xmm9, %xmm7
1985 ; SSE-NEXT: psllq $32, %xmm7
1986 ; SSE-NEXT: paddq %xmm10, %xmm7
1987 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1988 ; SSE-NEXT: pand %xmm8, %xmm7
1989 ; SSE-NEXT: pand %xmm8, %xmm6
1990 ; SSE-NEXT: packuswb %xmm7, %xmm6
1991 ; SSE-NEXT: pand %xmm8, %xmm5
1992 ; SSE-NEXT: pand %xmm8, %xmm4
1993 ; SSE-NEXT: packuswb %xmm5, %xmm4
1994 ; SSE-NEXT: packuswb %xmm6, %xmm4
1995 ; SSE-NEXT: pand %xmm8, %xmm3
1996 ; SSE-NEXT: pand %xmm8, %xmm2
1997 ; SSE-NEXT: packuswb %xmm3, %xmm2
1998 ; SSE-NEXT: pand %xmm8, %xmm1
1999 ; SSE-NEXT: pand %xmm8, %xmm0
2000 ; SSE-NEXT: packuswb %xmm1, %xmm0
2001 ; SSE-NEXT: packuswb %xmm2, %xmm0
2002 ; SSE-NEXT: packuswb %xmm4, %xmm0
2003 ; SSE-NEXT: retq
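; Note: masking every lane down to its low byte with the 255 pattern keeps the
; unsigned-saturating packuswb cascade from clamping, so the pack tree is a
; pure i64 -> i8 truncation.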
2007 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8
2008 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
2009 ; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9
2010 ; AVX1-NEXT: vpsllq $32, %xmm9, %xmm9
2011 ; AVX1-NEXT: vpaddq %xmm9, %xmm8, %xmm8
2012 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm9
2013 ; AVX1-NEXT: vpmuludq %xmm4, %xmm9, %xmm9
2014 ; AVX1-NEXT: vpsllq $32, %xmm9, %xmm9
2015 ; AVX1-NEXT: vpaddq %xmm9, %xmm8, %xmm8
2016 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm10
2017 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2018 ; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm9
2019 ; AVX1-NEXT: vpsrlq $32, %xmm10, %xmm4
2020 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
2021 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2022 ; AVX1-NEXT: vpaddq %xmm4, %xmm9, %xmm4
2023 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
2024 ; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm0
2025 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2026 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm9
2027 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4
2028 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0
2029 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
2030 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2031 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
2032 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
2033 ; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
2034 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2035 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm10
2036 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0
2037 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2038 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm5
2039 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
2040 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
2041 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2042 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
2043 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
2044 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
2045 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2046 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm1
2047 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm0
2048 ; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4
2049 ; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
2050 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2051 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
2052 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
2053 ; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4
2054 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2055 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm5
2056 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
2057 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2058 ; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm4
2059 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
2060 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6
2061 ; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
2062 ; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4
2063 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
2064 ; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0
2065 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2066 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
2067 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm2
2068 ; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4
2069 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4
2070 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2071 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
2072 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
2073 ; AVX1-NEXT: vpmuludq %xmm7, %xmm4, %xmm4
2074 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2075 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
2076 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
2077 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2078 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm6
2079 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7
2080 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7
2081 ; AVX1-NEXT: vpsllq $32, %xmm7, %xmm7
2082 ; AVX1-NEXT: vpaddq %xmm7, %xmm6, %xmm6
2083 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
2084 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
2085 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
2086 ; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
2087 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2088 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2089 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2090 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2091 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2092 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
2093 ; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
2094 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2095 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2096 ; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm2
2097 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
2098 ; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm2
2099 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
2100 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
2101 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
2102 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2103 ; AVX1-NEXT: vzeroupper
2104 ; AVX1-NEXT: retq
2108 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm8
2109 ; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9
2110 ; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9
2111 ; AVX2-NEXT: vpsllq $32, %ymm9, %ymm9
2112 ; AVX2-NEXT: vpaddq %ymm9, %ymm8, %ymm8
2113 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
2114 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
2115 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
2116 ; AVX2-NEXT: vpaddq %ymm1, %ymm8, %ymm1
2117 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
2118 ; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8
2119 ; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8
2120 ; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8
2121 ; AVX2-NEXT: vpaddq %ymm8, %ymm5, %ymm5
2122 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
2123 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
2124 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
2125 ; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
2126 ; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm4
2127 ; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5
2128 ; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5
2129 ; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
2130 ; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4
2131 ; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
2132 ; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
2133 ; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
2134 ; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
2135 ; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm4
2136 ; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5
2137 ; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5
2138 ; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
2139 ; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4
2140 ; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
2141 ; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
2142 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
2143 ; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm2
2144 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
2145 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
2146 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
2147 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
2148 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2149 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2150 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2151 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2152 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2153 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
2154 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
2155 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
2156 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
2157 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
2158 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2159 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2160 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2161 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
2162 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2163 ; AVX2-NEXT: vzeroupper
2164 ; AVX2-NEXT: retq
2168 ; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
2169 ; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm5
2170 ; AVX512-NEXT: vpmuludq %zmm5, %zmm1, %zmm5
2171 ; AVX512-NEXT: vpsllq $32, %zmm5, %zmm5
2172 ; AVX512-NEXT: vpaddq %zmm5, %zmm4, %zmm4
2173 ; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1
2174 ; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
2175 ; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1
2176 ; AVX512-NEXT: vpaddq %zmm1, %zmm4, %zmm1
2177 ; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
2178 ; AVX512-NEXT: vpsrlq $32, %zmm2, %zmm4
2179 ; AVX512-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
2180 ; AVX512-NEXT: vpsllq $32, %zmm4, %zmm4
2181 ; AVX512-NEXT: vpaddq %zmm4, %zmm3, %zmm3
2182 ; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
2183 ; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
2184 ; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
2185 ; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
2186 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2187 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
2188 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2189 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2190 ; AVX512-NEXT: retq
2199 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
2200 ; SSE-NEXT: pmuludq %xmm4, %xmm0
2201 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2202 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2203 ; SSE-NEXT: pmuludq %xmm8, %xmm4
2204 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2205 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2206 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2207 ; SSE-NEXT: pmuludq %xmm5, %xmm1
2208 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2209 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2210 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2211 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2212 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2213 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
2214 ; SSE-NEXT: pmuludq %xmm6, %xmm2
2215 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2216 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
2217 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2218 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2219 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2220 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2221 ; SSE-NEXT: pmuludq %xmm7, %xmm3
2222 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2223 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
2224 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2225 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2226 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2227 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2228 ; SSE-NEXT: pand %xmm4, %xmm3
2229 ; SSE-NEXT: pand %xmm4, %xmm2
2230 ; SSE-NEXT: packuswb %xmm3, %xmm2
2231 ; SSE-NEXT: pand %xmm4, %xmm1
2232 ; SSE-NEXT: pand %xmm4, %xmm0
2233 ; SSE-NEXT: packuswb %xmm1, %xmm0
2234 ; SSE-NEXT: packuswb %xmm2, %xmm0
2235 ; SSE-NEXT: retq
2239 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
2240 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2241 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2242 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
2243 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
2244 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2245 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2246 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
2247 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2248 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
2249 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2250 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
2251 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2252 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
2253 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
2254 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2255 ; AVX1-NEXT: vzeroupper
2256 ; AVX1-NEXT: retq
2260 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2261 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2262 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2263 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2264 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2265 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2266 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2267 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2268 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2269 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2270 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2271 ; AVX2-NEXT: vzeroupper
2272 ; AVX2-NEXT: retq
2276 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
2277 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2278 ; AVX512-NEXT: retq
2287 ; SSE-NEXT: pmullw %xmm2, %xmm0
2288 ; SSE-NEXT: pmullw %xmm3, %xmm1
2289 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2290 ; SSE-NEXT: pand %xmm2, %xmm1
2291 ; SSE-NEXT: pand %xmm2, %xmm0
2292 ; SSE-NEXT: packuswb %xmm1, %xmm0
2293 ; SSE-NEXT: retq
2297 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
2298 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2299 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2300 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2301 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2302 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2303 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
2304 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2305 ; AVX1-NEXT: vzeroupper
2306 ; AVX1-NEXT: retq
2310 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2311 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2312 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2313 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2314 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2315 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2316 ; AVX2-NEXT: vzeroupper
2317 ; AVX2-NEXT: retq
2321 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2322 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
2323 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2324 ; AVX512F-NEXT: retq
2328 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2329 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2330 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2331 ; AVX512BW-NEXT: retq
2344 ; SSE-NEXT: movl $1, %eax
2345 ; SSE-NEXT: movd %rax, %xmm2
2346 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
2347 ; SSE-NEXT: movdqa %xmm0, %xmm3
2348 ; SSE-NEXT: pmuludq %xmm2, %xmm3
2349 ; SSE-NEXT: psrlq $32, %xmm0
2350 ; SSE-NEXT: pmuludq %xmm2, %xmm0
2351 ; SSE-NEXT: psllq $32, %xmm0
2352 ; SSE-NEXT: paddq %xmm3, %xmm0
2353 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3]
2354 ; SSE-NEXT: movdqa %xmm1, %xmm3
2355 ; SSE-NEXT: pmuludq %xmm2, %xmm3
2356 ; SSE-NEXT: psrlq $32, %xmm1
2357 ; SSE-NEXT: pmuludq %xmm2, %xmm1
2358 ; SSE-NEXT: psllq $32, %xmm1
2359 ; SSE-NEXT: paddq %xmm3, %xmm1
2360 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2361 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2362 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2363 ; SSE-NEXT: retq
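; Note: the movl $1/movd/pslldq prologue materializes the constant
; <i64 0, i64 1> in a register rather than loading it from the constant pool.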
2367 ; AVX1-NEXT: movl $1, %eax
2368 ; AVX1-NEXT: vmovq %rax, %xmm1
2369 ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
2370 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
2371 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
2372 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
2373 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
2374 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
2375 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2376 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3]
2377 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
2378 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
2379 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
2380 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2381 ; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
2382 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
2383 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2384 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2385 ; AVX1-NEXT: vzeroupper
2386 ; AVX1-NEXT: retq
2390 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
2391 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
2392 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
2393 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
2394 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
2395 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
2396 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
2397 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
2398 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2399 ; AVX2-NEXT: vzeroupper
2400 ; AVX2-NEXT: retq
2404 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
2405 ; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
2406 ; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
2407 ; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
2408 ; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
2409 ; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
2410 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2411 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2412 ; AVX512-NEXT: retq
2421 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5]
2422 ; SSE-NEXT: movdqa %xmm2, %xmm5
2423 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2424 ; SSE-NEXT: psrlq $32, %xmm2
2425 ; SSE-NEXT: pmuludq %xmm4, %xmm2
2426 ; SSE-NEXT: psllq $32, %xmm2
2427 ; SSE-NEXT: paddq %xmm5, %xmm2
2428 ; SSE-NEXT: movl $1, %eax
2429 ; SSE-NEXT: movd %rax, %xmm4
2430 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
2431 ; SSE-NEXT: movdqa %xmm0, %xmm5
2432 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2433 ; SSE-NEXT: psrlq $32, %xmm0
2434 ; SSE-NEXT: pmuludq %xmm4, %xmm0
2435 ; SSE-NEXT: psllq $32, %xmm0
2436 ; SSE-NEXT: paddq %xmm5, %xmm0
2437 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [6,7]
2438 ; SSE-NEXT: movdqa %xmm3, %xmm5
2439 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2440 ; SSE-NEXT: psrlq $32, %xmm3
2441 ; SSE-NEXT: pmuludq %xmm4, %xmm3
2442 ; SSE-NEXT: psllq $32, %xmm3
2443 ; SSE-NEXT: paddq %xmm5, %xmm3
2444 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,3]
2445 ; SSE-NEXT: movdqa %xmm1, %xmm5
2446 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2447 ; SSE-NEXT: psrlq $32, %xmm1
2448 ; SSE-NEXT: pmuludq %xmm4, %xmm1
2449 ; SSE-NEXT: psllq $32, %xmm1
2450 ; SSE-NEXT: paddq %xmm5, %xmm1
2451 ; SSE-NEXT: pextrw $4, %xmm1, %eax
2452 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2453 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
2454 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2455 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2456 ; SSE-NEXT: pextrw $4, %xmm3, %edx
2457 ; SSE-NEXT: movd %edx, %xmm1
2458 ; SSE-NEXT: movd %eax, %xmm3
2459 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2460 ; SSE-NEXT: pextrw $4, %xmm2, %eax
2461 ; SSE-NEXT: movd %eax, %xmm1
2462 ; SSE-NEXT: movd %ecx, %xmm2
2463 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2464 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2465 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2466 ; SSE-NEXT: retq
2470 ; AVX1-NEXT: movl $1, %eax
2471 ; AVX1-NEXT: vmovq %rax, %xmm2
2472 ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
2473 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
2474 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
2475 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
2476 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
2477 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
2478 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2479 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3]
2480 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm4
2481 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
2482 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
2483 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2484 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
2485 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5]
2486 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
2487 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
2488 ; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
2489 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
2490 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
2491 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2492 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7]
2493 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5
2494 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
2495 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
2496 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
2497 ; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
2498 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
2499 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
2500 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
2501 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2502 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
2503 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
2504 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2505 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2506 ; AVX1-NEXT: vzeroupper
2507 ; AVX1-NEXT: retq
2511 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7]
2512 ; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
2513 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
2514 ; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
2515 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
2516 ; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
2517 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
2518 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3
2519 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
2520 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
2521 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
2522 ; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
2523 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
2524 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
2525 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
2526 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
2527 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2528 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
2529 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2530 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2531 ; AVX2-NEXT: vzeroupper
2532 ; AVX2-NEXT: retq
2536 ; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7]
2537 ; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
2538 ; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
2539 ; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
2540 ; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
2541 ; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
2542 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2543 ; AVX512-NEXT: retq
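; Note: under AVX512 the multiply still needs the 32-bit partial-product
; sequence (vpmullq is AVX512DQ-only), but the v8i64 -> v8i16 truncation
; itself collapses to a single vpmovqw from zmm to xmm.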
2552 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3]
2553 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2554 ; SSE-NEXT: pmuludq %xmm2, %xmm0
2555 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2556 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2557 ; SSE-NEXT: pmuludq %xmm3, %xmm2
2558 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2559 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2560 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,5,6,7]
2561 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
2562 ; SSE-NEXT: pmuludq %xmm2, %xmm1
2563 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2564 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2565 ; SSE-NEXT: pmuludq %xmm3, %xmm2
2566 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2567 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2568 ; SSE-NEXT: pslld $16, %xmm1
2569 ; SSE-NEXT: psrad $16, %xmm1
2570 ; SSE-NEXT: pslld $16, %xmm0
2571 ; SSE-NEXT: psrad $16, %xmm0
2572 ; SSE-NEXT: packssdw %xmm1, %xmm0
2573 ; SSE-NEXT: retq
2577 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
2578 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2579 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2580 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2581 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2582 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2583 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2584 ; AVX1-NEXT: vzeroupper
2585 ; AVX1-NEXT: retq
2589 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
2590 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
2591 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2592 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2593 ; AVX2-NEXT: vzeroupper
2594 ; AVX2-NEXT: retq
2598 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
2599 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2600 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2601 ; AVX512-NEXT: retq
2610 ; SSE-NEXT: movl $1, %eax
2611 ; SSE-NEXT: movd %rax, %xmm8
2612 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
2613 ; SSE-NEXT: movdqa %xmm0, %xmm9
2614 ; SSE-NEXT: pmuludq %xmm8, %xmm9
2615 ; SSE-NEXT: psrlq $32, %xmm0
2616 ; SSE-NEXT: pmuludq %xmm8, %xmm0
2617 ; SSE-NEXT: psllq $32, %xmm0
2618 ; SSE-NEXT: paddq %xmm9, %xmm0
2619 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3]
2620 ; SSE-NEXT: movdqa %xmm1, %xmm9
2621 ; SSE-NEXT: pmuludq %xmm8, %xmm9
2622 ; SSE-NEXT: psrlq $32, %xmm1
2623 ; SSE-NEXT: pmuludq %xmm8, %xmm1
2624 ; SSE-NEXT: psllq $32, %xmm1
2625 ; SSE-NEXT: paddq %xmm9, %xmm1
2626 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5]
2627 ; SSE-NEXT: movdqa %xmm2, %xmm9
2628 ; SSE-NEXT: pmuludq %xmm8, %xmm9
2629 ; SSE-NEXT: psrlq $32, %xmm2
2630 ; SSE-NEXT: pmuludq %xmm8, %xmm2
2631 ; SSE-NEXT: psllq $32, %xmm2
2632 ; SSE-NEXT: paddq %xmm9, %xmm2
2633 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7]
2634 ; SSE-NEXT: movdqa %xmm3, %xmm9
2635 ; SSE-NEXT: pmuludq %xmm8, %xmm9
2636 ; SSE-NEXT: psrlq $32, %xmm3
2637 ; SSE-NEXT: pmuludq %xmm8, %xmm3
2638 ; SSE-NEXT: psllq $32, %xmm3
2639 ; SSE-NEXT: paddq %xmm9, %xmm3
2640 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9]
2641 ; SSE-NEXT: movdqa %xmm4, %xmm9
2642 ; SSE-NEXT: pmuludq %xmm8, %xmm9
2643 ; SSE-NEXT: psrlq $32, %xmm4
2644 ; SSE-NEXT: pmuludq %xmm8, %xmm4
2645 ; SSE-NEXT: psllq $32, %xmm4
2646 ; SSE-NEXT: paddq %xmm9, %xmm4
2647 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11]
2648 ; SSE-NEXT: movdqa %xmm5, %xmm9
2649 ; SSE-NEXT: pmuludq %xmm8, %xmm9
2650 ; SSE-NEXT: psrlq $32, %xmm5
2651 ; SSE-NEXT: pmuludq %xmm8, %xmm5
2652 ; SSE-NEXT: psllq $32, %xmm5
2653 ; SSE-NEXT: paddq %xmm9, %xmm5
2654 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13]
2655 ; SSE-NEXT: movdqa %xmm6, %xmm9
2656 ; SSE-NEXT: pmuludq %xmm8, %xmm9
2657 ; SSE-NEXT: psrlq $32, %xmm6
2658 ; SSE-NEXT: pmuludq %xmm8, %xmm6
2659 ; SSE-NEXT: psllq $32, %xmm6
2660 ; SSE-NEXT: paddq %xmm9, %xmm6
2661 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15]
2662 ; SSE-NEXT: movdqa %xmm7, %xmm9
2663 ; SSE-NEXT: pmuludq %xmm8, %xmm9
2664 ; SSE-NEXT: psrlq $32, %xmm7
2665 ; SSE-NEXT: pmuludq %xmm8, %xmm7
2666 ; SSE-NEXT: psllq $32, %xmm7
2667 ; SSE-NEXT: paddq %xmm9, %xmm7
2668 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2669 ; SSE-NEXT: pand %xmm8, %xmm7
2670 ; SSE-NEXT: pand %xmm8, %xmm6
2671 ; SSE-NEXT: packuswb %xmm7, %xmm6
2672 ; SSE-NEXT: pand %xmm8, %xmm5
2673 ; SSE-NEXT: pand %xmm8, %xmm4
2674 ; SSE-NEXT: packuswb %xmm5, %xmm4
2675 ; SSE-NEXT: packuswb %xmm6, %xmm4
2676 ; SSE-NEXT: pand %xmm8, %xmm3
2677 ; SSE-NEXT: pand %xmm8, %xmm2
2678 ; SSE-NEXT: packuswb %xmm3, %xmm2
2679 ; SSE-NEXT: pand %xmm8, %xmm1
2680 ; SSE-NEXT: pand %xmm8, %xmm0
2681 ; SSE-NEXT: packuswb %xmm1, %xmm0
2682 ; SSE-NEXT: packuswb %xmm2, %xmm0
2683 ; SSE-NEXT: packuswb %xmm4, %xmm0
2684 ; SSE-NEXT: retq
2688 ; AVX1-NEXT: movl $1, %eax
2689 ; AVX1-NEXT: vmovq %rax, %xmm4
2690 ; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
2691 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm5
2692 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
2693 ; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
2694 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2695 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm8
2696 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2697 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3]
2698 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm6
2699 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
2700 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
2701 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2702 ; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm9
2703 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5]
2704 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm6
2705 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm7
2706 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
2707 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
2708 ; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
2709 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2710 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7]
2711 ; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm7
2712 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
2713 ; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm1
2714 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
2715 ; AVX1-NEXT: vpaddq %xmm1, %xmm7, %xmm1
2716 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9]
2717 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
2718 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
2719 ; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4
2720 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
2721 ; AVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4
2722 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
2723 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11]
2724 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
2725 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
2726 ; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
2727 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
2728 ; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2
2729 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13]
2730 ; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
2731 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm0
2732 ; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0
2733 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2734 ; AVX1-NEXT: vpaddq %xmm0, %xmm7, %xmm0
2735 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
2736 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [14,15]
2737 ; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
2738 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
2739 ; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3
2740 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
2741 ; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
2742 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2743 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
2744 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
2745 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
2746 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
2747 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm3
2748 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
2749 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
2750 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
2751 ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm2
2752 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
2753 ; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm2
2754 ; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm3
2755 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
2756 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
2757 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
2758 ; AVX1-NEXT: vzeroupper
2759 ; AVX1-NEXT: retq
2763 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,6,7]
2764 ; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5
2765 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
2766 ; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
2767 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
2768 ; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1
2769 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3]
2770 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
2771 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
2772 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
2773 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
2774 ; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
2775 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15]
2776 ; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm5
2777 ; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
2778 ; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
2779 ; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
2780 ; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3
2781 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,10,11]
2782 ; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5
2783 ; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
2784 ; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2
2785 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
2786 ; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
2787 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
2788 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
2789 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
2790 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
2791 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2792 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2793 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
2794 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2795 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2796 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
2797 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
2798 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
2799 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
2800 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
2801 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2802 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
2803 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2804 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
2805 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2806 ; AVX2-NEXT: vzeroupper
2807 ; AVX2-NEXT: retq
2811 ; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15]
2812 ; AVX512-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
2813 ; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1
2814 ; AVX512-NEXT: vpmuludq %zmm2, %zmm1, %zmm1
2815 ; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1
2816 ; AVX512-NEXT: vpaddq %zmm1, %zmm3, %zmm1
2817 ; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7]
2818 ; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
2819 ; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
2820 ; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
2821 ; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
2822 ; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
2823 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2824 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
2825 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2826 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2827 ; AVX512-NEXT: retq
2836 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3]
2837 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2838 ; SSE-NEXT: pmuludq %xmm4, %xmm0
2839 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2840 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2841 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2842 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2843 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2844 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7]
2845 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
2846 ; SSE-NEXT: pmuludq %xmm4, %xmm1
2847 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2848 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2849 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2850 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2851 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2852 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11]
2853 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
2854 ; SSE-NEXT: pmuludq %xmm4, %xmm2
2855 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2856 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2857 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2858 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2859 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2860 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15]
2861 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
2862 ; SSE-NEXT: pmuludq %xmm4, %xmm3
2863 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2864 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2865 ; SSE-NEXT: pmuludq %xmm5, %xmm4
2866 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2867 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2868 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2869 ; SSE-NEXT: pand %xmm4, %xmm3
2870 ; SSE-NEXT: pand %xmm4, %xmm2
2871 ; SSE-NEXT: packuswb %xmm3, %xmm2
2872 ; SSE-NEXT: pand %xmm4, %xmm1
2873 ; SSE-NEXT: pand %xmm4, %xmm0
2874 ; SSE-NEXT: packuswb %xmm1, %xmm0
2875 ; SSE-NEXT: packuswb %xmm2, %xmm0
2876 ; SSE-NEXT: retq
2880 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
2881 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2882 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2883 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3
2884 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2885 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
2886 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2887 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2888 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2889 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
2890 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2891 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2892 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
2893 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2894 ; AVX1-NEXT: vzeroupper
2895 ; AVX1-NEXT: retq
2899 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
2900 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
2901 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2902 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2903 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2904 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2905 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2906 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2907 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2908 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2909 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2910 ; AVX2-NEXT: vzeroupper
2911 ; AVX2-NEXT: retq
2915 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
2916 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
2917 ; AVX512-NEXT: retq
2926 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
2927 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
2928 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2929 ; SSE-NEXT: pand %xmm2, %xmm1
2930 ; SSE-NEXT: pand %xmm2, %xmm0
2931 ; SSE-NEXT: packuswb %xmm1, %xmm0
2932 ; SSE-NEXT: retq
2936 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
2937 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2938 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2939 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2940 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2941 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2942 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2943 ; AVX1-NEXT: vzeroupper
2944 ; AVX1-NEXT: retq
2948 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2949 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2950 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2951 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2952 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2953 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2954 ; AVX2-NEXT: vzeroupper
2955 ; AVX2-NEXT: retq
2959 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2960 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
2961 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2962 ; AVX512F-NEXT: retq
2966 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2967 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2968 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2969 ; AVX512BW-NEXT: retq
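; Note: vpmovwb requires AVX512BW; plain AVX512F instead widens with
; vpmovsxwd and then narrows with vpmovdb.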
2982 ; SSE-NEXT: pand %xmm2, %xmm0
2983 ; SSE-NEXT: pand %xmm3, %xmm1
2984 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2985 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2986 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2987 ; SSE-NEXT: retq
2991 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
2992 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2993 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
2994 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2995 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2996 ; AVX1-NEXT: vzeroupper
2997 ; AVX1-NEXT: retq
3001 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
3002 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3003 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3004 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3005 ; AVX2-NEXT: vzeroupper
3006 ; AVX2-NEXT: retq
3010 ; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
3011 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3012 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3013 ; AVX512-NEXT: retq
3022 ; SSE-NEXT: pand %xmm6, %xmm2
3023 ; SSE-NEXT: pand %xmm4, %xmm0
3024 ; SSE-NEXT: pand %xmm7, %xmm3
3025 ; SSE-NEXT: pand %xmm5, %xmm1
3026 ; SSE-NEXT: pextrw $4, %xmm1, %eax
3027 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
3028 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
3029 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3030 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3031 ; SSE-NEXT: pextrw $4, %xmm3, %edx
3032 ; SSE-NEXT: movd %edx, %xmm1
3033 ; SSE-NEXT: movd %eax, %xmm3
3034 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3035 ; SSE-NEXT: pextrw $4, %xmm2, %eax
3036 ; SSE-NEXT: movd %eax, %xmm1
3037 ; SSE-NEXT: movd %ecx, %xmm2
3038 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3039 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3040 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3041 ; SSE-NEXT: retq
3045 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3046 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
3047 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3048 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
3049 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3050 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3051 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
3052 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3053 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3054 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3055 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3056 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3057 ; AVX1-NEXT: vzeroupper
3058 ; AVX1-NEXT: retq
3062 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
3063 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3064 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3065 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3066 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3067 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3068 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3069 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3070 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3071 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3072 ; AVX2-NEXT: vzeroupper
3073 ; AVX2-NEXT: retq
3077 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
3078 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3079 ; AVX512-NEXT: retq
3088 ; SSE-NEXT: pand %xmm2, %xmm0
3089 ; SSE-NEXT: pand %xmm3, %xmm1
3090 ; SSE-NEXT: pslld $16, %xmm1
3091 ; SSE-NEXT: psrad $16, %xmm1
3092 ; SSE-NEXT: pslld $16, %xmm0
3093 ; SSE-NEXT: psrad $16, %xmm0
3094 ; SSE-NEXT: packssdw %xmm1, %xmm0
3095 ; SSE-NEXT: retq
3099 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
3100 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3101 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3102 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3103 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3104 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3105 ; AVX1-NEXT: vzeroupper
3106 ; AVX1-NEXT: retq
3110 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
3111 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3112 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3113 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3114 ; AVX2-NEXT: vzeroupper
3115 ; AVX2-NEXT: retq
3119 ; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
3120 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3121 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3122 ; AVX512-NEXT: retq
3131 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
3132 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
3133 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
3134 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
3135 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
3136 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
3137 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
3138 ; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
3139 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3140 ; SSE-NEXT: pand %xmm8, %xmm7
3141 ; SSE-NEXT: pand %xmm8, %xmm6
3142 ; SSE-NEXT: packuswb %xmm7, %xmm6
3143 ; SSE-NEXT: pand %xmm8, %xmm5
3144 ; SSE-NEXT: pand %xmm8, %xmm4
3145 ; SSE-NEXT: packuswb %xmm5, %xmm4
3146 ; SSE-NEXT: packuswb %xmm6, %xmm4
3147 ; SSE-NEXT: pand %xmm8, %xmm3
3148 ; SSE-NEXT: pand %xmm8, %xmm2
3149 ; SSE-NEXT: packuswb %xmm3, %xmm2
3150 ; SSE-NEXT: pand %xmm8, %xmm1
3151 ; SSE-NEXT: pand %xmm8, %xmm0
3152 ; SSE-NEXT: packuswb %xmm1, %xmm0
3153 ; SSE-NEXT: packuswb %xmm2, %xmm0
3154 ; SSE-NEXT: packuswb %xmm4, %xmm0
3155 ; SSE-NEXT: retq
3159 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
3160 ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
3161 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
3162 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
3163 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
3164 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3165 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3166 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3167 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
3168 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
3169 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3170 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
3171 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
3172 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
3173 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3174 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3175 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
3176 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
3177 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3178 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3179 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
3180 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
3181 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3182 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3183 ; AVX1-NEXT: vzeroupper
3184 ; AVX1-NEXT: retq
3188 ; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
3189 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3190 ; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
3191 ; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
3192 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
3193 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
3194 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
3195 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
3196 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3197 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3198 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3199 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3200 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3201 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
3202 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3203 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3204 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3205 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3206 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3207 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3208 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3209 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
3210 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3211 ; AVX2-NEXT: vzeroupper
3212 ; AVX2-NEXT: retq
3216 ; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
3217 ; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
3218 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3219 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
3220 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3221 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3222 ; AVX512-NEXT: retq
3231 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3232 ; SSE-NEXT: pand %xmm8, %xmm7
3233 ; SSE-NEXT: pand %xmm3, %xmm7
3234 ; SSE-NEXT: pand %xmm8, %xmm6
3235 ; SSE-NEXT: pand %xmm2, %xmm6
3236 ; SSE-NEXT: packuswb %xmm7, %xmm6
3237 ; SSE-NEXT: pand %xmm8, %xmm5
3238 ; SSE-NEXT: pand %xmm1, %xmm5
3239 ; SSE-NEXT: pand %xmm8, %xmm4
3240 ; SSE-NEXT: pand %xmm4, %xmm0
3241 ; SSE-NEXT: packuswb %xmm5, %xmm0
3242 ; SSE-NEXT: packuswb %xmm6, %xmm0
3243 ; SSE-NEXT: retq
3247 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
3248 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
3249 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3250 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3251 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3252 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
3253 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
3254 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3255 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3256 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
3257 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3258 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3259 ; AVX1-NEXT: vzeroupper
3260 ; AVX1-NEXT: retq
3264 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
3265 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
3266 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3267 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3268 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3269 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3270 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
3271 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3272 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3273 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
3274 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3275 ; AVX2-NEXT: vzeroupper
3276 ; AVX2-NEXT: retq
3280 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
3281 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3282 ; AVX512-NEXT: retq
3291 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3292 ; SSE-NEXT: pand %xmm4, %xmm3
3293 ; SSE-NEXT: pand %xmm1, %xmm3
3294 ; SSE-NEXT: pand %xmm4, %xmm2
3295 ; SSE-NEXT: pand %xmm2, %xmm0
3296 ; SSE-NEXT: packuswb %xmm3, %xmm0
3297 ; SSE-NEXT: retq
3301 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
3302 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3303 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3304 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3305 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3306 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3307 ; AVX1-NEXT: vzeroupper
3308 ; AVX1-NEXT: retq
3312 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
3313 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3314 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3315 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3316 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3317 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3318 ; AVX2-NEXT: vzeroupper
3319 ; AVX2-NEXT: retq
3323 ; AVX512F-NEXT: vandps %ymm1, %ymm0, %ymm0
3324 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
3325 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3326 ; AVX512F-NEXT: retq
3330 ; AVX512BW-NEXT: vandps %ymm1, %ymm0, %ymm0
3331 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3332 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3333 ; AVX512BW-NEXT: retq
3346 ; SSE-NEXT: movl $1, %eax
3347 ; SSE-NEXT: movd %rax, %xmm2
3348 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
3349 ; SSE-NEXT: pand %xmm0, %xmm2
3350 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3351 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3352 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3353 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3354 ; SSE-NEXT: retq
3358 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3359 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3360 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
3361 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3362 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
3363 ; AVX1-NEXT: vzeroupper
3364 ; AVX1-NEXT: retq
3368 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3369 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3370 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3371 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3372 ; AVX2-NEXT: vzeroupper
3373 ; AVX2-NEXT: retq
3377 ; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3378 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3379 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3380 ; AVX512-NEXT: retq
3389 ; SSE-NEXT: movdqa %xmm0, %xmm4
3390 ; SSE-NEXT: movl $1, %eax
3391 ; SSE-NEXT: movd %rax, %xmm0
3392 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
3393 ; SSE-NEXT: pand %xmm4, %xmm0
3394 ; SSE-NEXT: pand {{.*}}(%rip), %xmm2
3395 ; SSE-NEXT: pand {{.*}}(%rip), %xmm3
3396 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3397 ; SSE-NEXT: pextrw $4, %xmm1, %eax
3398 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
3399 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
3400 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3401 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3402 ; SSE-NEXT: pextrw $4, %xmm3, %edx
3403 ; SSE-NEXT: movd %edx, %xmm1
3404 ; SSE-NEXT: movd %eax, %xmm3
3405 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3406 ; SSE-NEXT: movd %ecx, %xmm1
3407 ; SSE-NEXT: pextrw $4, %xmm2, %eax
3408 ; SSE-NEXT: movd %eax, %xmm2
3409 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3410 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
3411 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3412 ; SSE-NEXT: retq
3416 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3417 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
3418 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3419 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
3420 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3421 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3422 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
3423 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3424 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3425 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3426 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3427 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3428 ; AVX1-NEXT: vzeroupper
3429 ; AVX1-NEXT: retq
3433 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
3434 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3435 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3436 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3437 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3438 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3439 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3440 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3441 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3442 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3443 ; AVX2-NEXT: vzeroupper
3444 ; AVX2-NEXT: retq
3448 ; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
3449 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3450 ; AVX512-NEXT: retq
3459 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3460 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3461 ; SSE-NEXT: pslld $16, %xmm1
3462 ; SSE-NEXT: psrad $16, %xmm1
3463 ; SSE-NEXT: pslld $16, %xmm0
3464 ; SSE-NEXT: psrad $16, %xmm0
3465 ; SSE-NEXT: packssdw %xmm1, %xmm0
3466 ; SSE-NEXT: retq
3470 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3471 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3472 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3473 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3474 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3475 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3476 ; AVX1-NEXT: vzeroupper
3477 ; AVX1-NEXT: retq
3481 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3482 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3483 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3484 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3485 ; AVX2-NEXT: vzeroupper
3486 ; AVX2-NEXT: retq
3490 ; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3491 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3492 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3493 ; AVX512-NEXT: retq
3502 ; SSE-NEXT: movl $1, %eax
3503 ; SSE-NEXT: movd %rax, %xmm8
3504 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
3505 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3506 ; SSE-NEXT: pand {{.*}}(%rip), %xmm2
3507 ; SSE-NEXT: pand {{.*}}(%rip), %xmm3
3508 ; SSE-NEXT: pand {{.*}}(%rip), %xmm4
3509 ; SSE-NEXT: pand {{.*}}(%rip), %xmm5
3510 ; SSE-NEXT: pand {{.*}}(%rip), %xmm6
3511 ; SSE-NEXT: pand {{.*}}(%rip), %xmm7
3512 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3513 ; SSE-NEXT: pand %xmm9, %xmm7
3514 ; SSE-NEXT: pand %xmm9, %xmm6
3515 ; SSE-NEXT: packuswb %xmm7, %xmm6
3516 ; SSE-NEXT: pand %xmm9, %xmm5
3517 ; SSE-NEXT: pand %xmm9, %xmm4
3518 ; SSE-NEXT: packuswb %xmm5, %xmm4
3519 ; SSE-NEXT: packuswb %xmm6, %xmm4
3520 ; SSE-NEXT: pand %xmm9, %xmm3
3521 ; SSE-NEXT: pand %xmm9, %xmm2
3522 ; SSE-NEXT: packuswb %xmm3, %xmm2
3523 ; SSE-NEXT: pand %xmm9, %xmm1
3524 ; SSE-NEXT: pand %xmm9, %xmm8
3525 ; SSE-NEXT: pand %xmm8, %xmm0
3526 ; SSE-NEXT: packuswb %xmm1, %xmm0
3527 ; SSE-NEXT: packuswb %xmm2, %xmm0
3528 ; SSE-NEXT: packuswb %xmm4, %xmm0
3529 ; SSE-NEXT: retq
3533 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3534 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
3535 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
3536 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm3
3537 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
3538 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3539 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3540 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3541 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
3542 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
3543 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3544 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
3545 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
3546 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
3547 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3548 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3549 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
3550 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
3551 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3552 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3553 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
3554 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
3555 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3556 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3557 ; AVX1-NEXT: vzeroupper
3558 ; AVX1-NEXT: retq
3562 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
3563 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3564 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
3565 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
3566 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
3567 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
3568 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
3569 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
3570 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3571 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3572 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3573 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3574 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3575 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
3576 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3577 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3578 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3579 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3580 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3581 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3582 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3583 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
3584 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3585 ; AVX2-NEXT: vzeroupper
3586 ; AVX2-NEXT: retq
3590 ; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
3591 ; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
3592 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3593 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
3594 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3595 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3596 ; AVX512-NEXT: retq
3605 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3606 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3607 ; SSE-NEXT: pand {{.*}}(%rip), %xmm2
3608 ; SSE-NEXT: pand {{.*}}(%rip), %xmm3
3609 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3610 ; SSE-NEXT: pand %xmm4, %xmm3
3611 ; SSE-NEXT: pand %xmm4, %xmm2
3612 ; SSE-NEXT: packuswb %xmm3, %xmm2
3613 ; SSE-NEXT: pand %xmm4, %xmm1
3614 ; SSE-NEXT: pand %xmm4, %xmm0
3615 ; SSE-NEXT: packuswb %xmm1, %xmm0
3616 ; SSE-NEXT: packuswb %xmm2, %xmm0
3617 ; SSE-NEXT: retq
3621 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3622 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
3623 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3624 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3625 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3626 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
3627 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
3628 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3629 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3630 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
3631 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3632 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3633 ; AVX1-NEXT: vzeroupper
3634 ; AVX1-NEXT: retq
3638 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3639 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
3640 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3641 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
3642 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3643 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3644 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
3645 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
3646 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3647 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
3648 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3649 ; AVX2-NEXT: vzeroupper
3650 ; AVX2-NEXT: retq
3654 ; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
3655 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3656 ; AVX512-NEXT: retq
3665 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3666 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3667 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
3668 ; SSE-NEXT: pand %xmm2, %xmm1
3669 ; SSE-NEXT: pand %xmm2, %xmm0
3670 ; SSE-NEXT: packuswb %xmm1, %xmm0
3671 ; SSE-NEXT: retq
3675 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3676 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3677 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3678 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3679 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3680 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3681 ; AVX1-NEXT: vzeroupper
3682 ; AVX1-NEXT: retq
3686 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3687 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3688 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3689 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3690 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3691 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3692 ; AVX2-NEXT: vzeroupper
3693 ; AVX2-NEXT: retq
3697 ; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3698 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
3699 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3700 ; AVX512F-NEXT: retq
3704 ; AVX512BW-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3705 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3706 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3707 ; AVX512BW-NEXT: retq
3720 ; SSE-NEXT: pxor %xmm2, %xmm0
3721 ; SSE-NEXT: pxor %xmm3, %xmm1
3722 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3723 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3724 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3725 ; SSE-NEXT: retq
3729 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3730 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3731 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
3732 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3733 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
3734 ; AVX1-NEXT: vzeroupper
3735 ; AVX1-NEXT: retq
3739 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3740 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3741 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3742 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3743 ; AVX2-NEXT: vzeroupper
3744 ; AVX2-NEXT: retq
3748 ; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0
3749 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3750 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3751 ; AVX512-NEXT: retq
3760 ; SSE-NEXT: pxor %xmm6, %xmm2
3761 ; SSE-NEXT: pxor %xmm4, %xmm0
3762 ; SSE-NEXT: pxor %xmm7, %xmm3
3763 ; SSE-NEXT: pxor %xmm5, %xmm1
3764 ; SSE-NEXT: pextrw $4, %xmm1, %eax
3765 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
3766 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
3767 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3768 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3769 ; SSE-NEXT: pextrw $4, %xmm3, %edx
3770 ; SSE-NEXT: movd %edx, %xmm1
3771 ; SSE-NEXT: movd %eax, %xmm3
3772 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3773 ; SSE-NEXT: pextrw $4, %xmm2, %eax
3774 ; SSE-NEXT: movd %eax, %xmm1
3775 ; SSE-NEXT: movd %ecx, %xmm2
3776 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3777 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3778 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3779 ; SSE-NEXT: retq
3783 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3784 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3785 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3786 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
3787 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3788 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3789 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
3790 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3791 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3792 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3793 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
3794 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3795 ; AVX1-NEXT: vzeroupper
3796 ; AVX1-NEXT: retq
3800 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
3801 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
3802 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3803 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3804 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3805 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3806 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3807 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3808 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3809 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3810 ; AVX2-NEXT: vzeroupper
3811 ; AVX2-NEXT: retq
3815 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
3816 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3817 ; AVX512-NEXT: retq
3826 ; SSE-NEXT: pxor %xmm2, %xmm0
3827 ; SSE-NEXT: pxor %xmm3, %xmm1
3828 ; SSE-NEXT: pslld $16, %xmm1
3829 ; SSE-NEXT: psrad $16, %xmm1
3830 ; SSE-NEXT: pslld $16, %xmm0
3831 ; SSE-NEXT: psrad $16, %xmm0
3832 ; SSE-NEXT: packssdw %xmm1, %xmm0
3833 ; SSE-NEXT: retq
3837 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
3838 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3839 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3840 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
3841 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3842 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3843 ; AVX1-NEXT: vzeroupper
3844 ; AVX1-NEXT: retq
3848 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
3849 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3850 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3851 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3852 ; AVX2-NEXT: vzeroupper
3853 ; AVX2-NEXT: retq
3857 ; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0
3858 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3859 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3860 ; AVX512-NEXT: retq
3869 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
3870 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
3871 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
3872 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
3873 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
3874 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
3875 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
3876 ; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
3877 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3878 ; SSE-NEXT: pand %xmm8, %xmm7
3879 ; SSE-NEXT: pand %xmm8, %xmm6
3880 ; SSE-NEXT: packuswb %xmm7, %xmm6
3881 ; SSE-NEXT: pand %xmm8, %xmm5
3882 ; SSE-NEXT: pand %xmm8, %xmm4
3883 ; SSE-NEXT: packuswb %xmm5, %xmm4
3884 ; SSE-NEXT: packuswb %xmm6, %xmm4
3885 ; SSE-NEXT: pand %xmm8, %xmm3
3886 ; SSE-NEXT: pand %xmm8, %xmm2
3887 ; SSE-NEXT: packuswb %xmm3, %xmm2
3888 ; SSE-NEXT: pand %xmm8, %xmm1
3889 ; SSE-NEXT: pand %xmm8, %xmm0
3890 ; SSE-NEXT: packuswb %xmm1, %xmm0
3891 ; SSE-NEXT: packuswb %xmm2, %xmm0
3892 ; SSE-NEXT: packuswb %xmm4, %xmm0
3893 ; SSE-NEXT: retq
3897 ; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
3898 ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
3899 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
3900 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
3901 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
3902 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3903 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3904 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3905 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
3906 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
3907 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
3908 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
3909 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
3910 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
3911 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
3912 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3913 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
3914 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
3915 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
3916 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
3917 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
3918 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
3919 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3920 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3921 ; AVX1-NEXT: vzeroupper
3922 ; AVX1-NEXT: retq
3926 ; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
3927 ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
3928 ; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
3929 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
3930 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
3931 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
3932 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
3933 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
3934 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3935 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3936 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
3937 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3938 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3939 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
3940 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3941 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3942 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3943 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3944 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3945 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
3946 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3947 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
3948 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3949 ; AVX2-NEXT: vzeroupper
3950 ; AVX2-NEXT: retq
3954 ; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
3955 ; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
3956 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3957 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
3958 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3959 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3960 ; AVX512-NEXT: retq
3969 ; SSE-NEXT: pxor %xmm4, %xmm0
3970 ; SSE-NEXT: pxor %xmm5, %xmm1
3971 ; SSE-NEXT: pxor %xmm6, %xmm2
3972 ; SSE-NEXT: pxor %xmm7, %xmm3
3973 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3974 ; SSE-NEXT: pand %xmm4, %xmm3
3975 ; SSE-NEXT: pand %xmm4, %xmm2
3976 ; SSE-NEXT: packuswb %xmm3, %xmm2
3977 ; SSE-NEXT: pand %xmm4, %xmm1
3978 ; SSE-NEXT: pand %xmm4, %xmm0
3979 ; SSE-NEXT: packuswb %xmm1, %xmm0
3980 ; SSE-NEXT: packuswb %xmm2, %xmm0
3981 ; SSE-NEXT: retq
3985 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
3986 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
3987 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
3988 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3989 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3990 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
3991 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
3992 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3993 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
3994 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
3995 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3996 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3997 ; AVX1-NEXT: vzeroupper
3998 ; AVX1-NEXT: retq
4002 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
4003 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
4004 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4005 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
4006 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4007 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4008 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
4009 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4010 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4011 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
4012 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4013 ; AVX2-NEXT: vzeroupper
4014 ; AVX2-NEXT: retq
4018 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
4019 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4020 ; AVX512-NEXT: retq
4029 ; SSE-NEXT: pxor %xmm2, %xmm0
4030 ; SSE-NEXT: pxor %xmm3, %xmm1
4031 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4032 ; SSE-NEXT: pand %xmm2, %xmm1
4033 ; SSE-NEXT: pand %xmm2, %xmm0
4034 ; SSE-NEXT: packuswb %xmm1, %xmm0
4035 ; SSE-NEXT: retq
4039 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
4040 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4041 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4042 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4043 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4044 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4045 ; AVX1-NEXT: vzeroupper
4046 ; AVX1-NEXT: retq
4050 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
4051 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4052 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4053 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4054 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4055 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4056 ; AVX2-NEXT: vzeroupper
4057 ; AVX2-NEXT: retq
4061 ; AVX512F-NEXT: vxorps %ymm1, %ymm0, %ymm0
4062 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
4063 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4064 ; AVX512F-NEXT: retq
4068 ; AVX512BW-NEXT: vxorps %ymm1, %ymm0, %ymm0
4069 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4070 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4071 ; AVX512BW-NEXT: retq
4084 ; SSE-NEXT: movl $1, %eax
4085 ; SSE-NEXT: movd %rax, %xmm2
4086 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
4087 ; SSE-NEXT: pxor %xmm0, %xmm2
4088 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
4089 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4090 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4091 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4092 ; SSE-NEXT: retq
4096 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4097 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4098 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
4099 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4100 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4101 ; AVX1-NEXT: vzeroupper
4102 ; AVX1-NEXT: retq
4106 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4107 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4108 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4109 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4110 ; AVX2-NEXT: vzeroupper
4111 ; AVX2-NEXT: retq
4115 ; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4116 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4117 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4118 ; AVX512-NEXT: retq
4127 ; SSE-NEXT: movdqa %xmm0, %xmm4
4128 ; SSE-NEXT: movl $1, %eax
4129 ; SSE-NEXT: movd %rax, %xmm0
4130 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4131 ; SSE-NEXT: pxor %xmm4, %xmm0
4132 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
4133 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
4134 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
4135 ; SSE-NEXT: pextrw $4, %xmm1, %eax
4136 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4137 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
4138 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4139 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4140 ; SSE-NEXT: pextrw $4, %xmm3, %edx
4141 ; SSE-NEXT: movd %edx, %xmm1
4142 ; SSE-NEXT: movd %eax, %xmm3
4143 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4144 ; SSE-NEXT: movd %ecx, %xmm1
4145 ; SSE-NEXT: pextrw $4, %xmm2, %eax
4146 ; SSE-NEXT: movd %eax, %xmm2
4147 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4148 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4149 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4150 ; SSE-NEXT: retq
4154 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4155 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
4156 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4157 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
4158 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4159 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4160 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4161 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4162 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4163 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4164 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4165 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4166 ; AVX1-NEXT: vzeroupper
4167 ; AVX1-NEXT: retq
4171 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
4172 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4173 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4174 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4175 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4176 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4177 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4178 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4179 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4180 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4181 ; AVX2-NEXT: vzeroupper
4182 ; AVX2-NEXT: retq
4186 ; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
4187 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4188 ; AVX512-NEXT: retq
4197 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4198 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
4199 ; SSE-NEXT: pslld $16, %xmm1
4200 ; SSE-NEXT: psrad $16, %xmm1
4201 ; SSE-NEXT: pslld $16, %xmm0
4202 ; SSE-NEXT: psrad $16, %xmm0
4203 ; SSE-NEXT: packssdw %xmm1, %xmm0
4204 ; SSE-NEXT: retq
4208 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4209 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4210 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4211 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4212 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4213 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4214 ; AVX1-NEXT: vzeroupper
4215 ; AVX1-NEXT: retq
4219 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4220 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4221 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4222 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4223 ; AVX2-NEXT: vzeroupper
4224 ; AVX2-NEXT: retq
4228 ; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4229 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4230 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4231 ; AVX512-NEXT: retq
4240 ; SSE-NEXT: movl $1, %eax
4241 ; SSE-NEXT: movd %rax, %xmm8
4242 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
4243 ; SSE-NEXT: pxor %xmm8, %xmm0
4244 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
4245 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
4246 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
4247 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm4
4248 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm5
4249 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm6
4250 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm7
4251 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4252 ; SSE-NEXT: pand %xmm8, %xmm7
4253 ; SSE-NEXT: pand %xmm8, %xmm6
4254 ; SSE-NEXT: packuswb %xmm7, %xmm6
4255 ; SSE-NEXT: pand %xmm8, %xmm5
4256 ; SSE-NEXT: pand %xmm8, %xmm4
4257 ; SSE-NEXT: packuswb %xmm5, %xmm4
4258 ; SSE-NEXT: packuswb %xmm6, %xmm4
4259 ; SSE-NEXT: pand %xmm8, %xmm3
4260 ; SSE-NEXT: pand %xmm8, %xmm2
4261 ; SSE-NEXT: packuswb %xmm3, %xmm2
4262 ; SSE-NEXT: pand %xmm8, %xmm1
4263 ; SSE-NEXT: pand %xmm8, %xmm0
4264 ; SSE-NEXT: packuswb %xmm1, %xmm0
4265 ; SSE-NEXT: packuswb %xmm2, %xmm0
4266 ; SSE-NEXT: packuswb %xmm4, %xmm0
4267 ; SSE-NEXT: retq
4271 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4272 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
4273 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm2, %ymm2
4274 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm3, %ymm3
4275 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
4276 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4277 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
4278 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4279 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
4280 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
4281 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
4282 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
4283 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
4284 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
4285 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4286 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4287 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
4288 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
4289 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4290 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4291 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
4292 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
4293 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4294 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4295 ; AVX1-NEXT: vzeroupper
4296 ; AVX1-NEXT: retq
4300 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
4301 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4302 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm3, %ymm3
4303 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm2, %ymm2
4304 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
4305 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
4306 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
4307 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
4308 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4309 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4310 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4311 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4312 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4313 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
4314 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4315 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4316 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4317 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4318 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4319 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4320 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4321 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
4322 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4323 ; AVX2-NEXT: vzeroupper
4324 ; AVX2-NEXT: retq
4328 ; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
4329 ; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
4330 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4331 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
4332 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4333 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4334 ; AVX512-NEXT: retq
4343 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4344 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
4345 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
4346 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
4347 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4348 ; SSE-NEXT: pand %xmm4, %xmm3
4349 ; SSE-NEXT: pand %xmm4, %xmm2
4350 ; SSE-NEXT: packuswb %xmm3, %xmm2
4351 ; SSE-NEXT: pand %xmm4, %xmm1
4352 ; SSE-NEXT: pand %xmm4, %xmm0
4353 ; SSE-NEXT: packuswb %xmm1, %xmm0
4354 ; SSE-NEXT: packuswb %xmm2, %xmm0
4355 ; SSE-NEXT: retq
4359 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4360 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
4361 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4362 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4363 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4364 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
4365 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
4366 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4367 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4368 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
4369 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4370 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4371 ; AVX1-NEXT: vzeroupper
4372 ; AVX1-NEXT: retq
4376 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4377 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
4378 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4379 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
4380 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4381 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4382 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
4383 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4384 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4385 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
4386 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4387 ; AVX2-NEXT: vzeroupper
4388 ; AVX2-NEXT: retq
4392 ; AVX512-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0
4393 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4394 ; AVX512-NEXT: retq
4403 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4404 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
4405 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4406 ; SSE-NEXT: pand %xmm2, %xmm1
4407 ; SSE-NEXT: pand %xmm2, %xmm0
4408 ; SSE-NEXT: packuswb %xmm1, %xmm0
4409 ; SSE-NEXT: retq
4413 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4414 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4415 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4416 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4417 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4418 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4419 ; AVX1-NEXT: vzeroupper
4420 ; AVX1-NEXT: retq
4424 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4425 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4426 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4427 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4428 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4429 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4430 ; AVX2-NEXT: vzeroupper
4431 ; AVX2-NEXT: retq
4435 ; AVX512F-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4436 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
4437 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4438 ; AVX512F-NEXT: retq
4442 ; AVX512BW-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4443 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4444 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4445 ; AVX512BW-NEXT: retq
4458 ; SSE-NEXT: por %xmm2, %xmm0
4459 ; SSE-NEXT: por %xmm3, %xmm1
4460 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4461 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4462 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4463 ; SSE-NEXT: retq
4467 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4468 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4469 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
4470 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4471 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4472 ; AVX1-NEXT: vzeroupper
4473 ; AVX1-NEXT: retq
4477 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4478 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4479 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4480 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4481 ; AVX2-NEXT: vzeroupper
4482 ; AVX2-NEXT: retq
4486 ; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
4487 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4488 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4489 ; AVX512-NEXT: retq
4498 ; SSE-NEXT: por %xmm6, %xmm2
4499 ; SSE-NEXT: por %xmm4, %xmm0
4500 ; SSE-NEXT: por %xmm7, %xmm3
4501 ; SSE-NEXT: por %xmm5, %xmm1
4502 ; SSE-NEXT: pextrw $4, %xmm1, %eax
4503 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4504 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
4505 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4506 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4507 ; SSE-NEXT: pextrw $4, %xmm3, %edx
4508 ; SSE-NEXT: movd %edx, %xmm1
4509 ; SSE-NEXT: movd %eax, %xmm3
4510 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4511 ; SSE-NEXT: pextrw $4, %xmm2, %eax
4512 ; SSE-NEXT: movd %eax, %xmm1
4513 ; SSE-NEXT: movd %ecx, %xmm2
4514 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4515 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4516 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4517 ; SSE-NEXT: retq
4521 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4522 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4523 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4524 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
4525 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4526 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4527 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4528 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4529 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4530 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4531 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4532 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4533 ; AVX1-NEXT: vzeroupper
4534 ; AVX1-NEXT: retq
4538 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
4539 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
4540 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4541 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4542 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4543 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4544 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4545 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4546 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4547 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4548 ; AVX2-NEXT: vzeroupper
4549 ; AVX2-NEXT: retq
4553 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
4554 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4555 ; AVX512-NEXT: retq
4564 ; SSE-NEXT: por %xmm2, %xmm0
4565 ; SSE-NEXT: por %xmm3, %xmm1
4566 ; SSE-NEXT: pslld $16, %xmm1
4567 ; SSE-NEXT: psrad $16, %xmm1
4568 ; SSE-NEXT: pslld $16, %xmm0
4569 ; SSE-NEXT: psrad $16, %xmm0
4570 ; SSE-NEXT: packssdw %xmm1, %xmm0
4571 ; SSE-NEXT: retq
4575 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4576 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4577 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4578 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4579 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4580 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4581 ; AVX1-NEXT: vzeroupper
4582 ; AVX1-NEXT: retq
4586 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4587 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4588 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4589 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4590 ; AVX2-NEXT: vzeroupper
4591 ; AVX2-NEXT: retq
4595 ; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
4596 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4597 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4598 ; AVX512-NEXT: retq
4607 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
4608 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
4609 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
4610 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
4611 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
4612 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
4613 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
4614 ; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
4615 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4616 ; SSE-NEXT: pand %xmm8, %xmm7
4617 ; SSE-NEXT: pand %xmm8, %xmm6
4618 ; SSE-NEXT: packuswb %xmm7, %xmm6
4619 ; SSE-NEXT: pand %xmm8, %xmm5
4620 ; SSE-NEXT: pand %xmm8, %xmm4
4621 ; SSE-NEXT: packuswb %xmm5, %xmm4
4622 ; SSE-NEXT: packuswb %xmm6, %xmm4
4623 ; SSE-NEXT: pand %xmm8, %xmm3
4624 ; SSE-NEXT: pand %xmm8, %xmm2
4625 ; SSE-NEXT: packuswb %xmm3, %xmm2
4626 ; SSE-NEXT: pand %xmm8, %xmm1
4627 ; SSE-NEXT: pand %xmm8, %xmm0
4628 ; SSE-NEXT: packuswb %xmm1, %xmm0
4629 ; SSE-NEXT: packuswb %xmm2, %xmm0
4630 ; SSE-NEXT: packuswb %xmm4, %xmm0
4631 ; SSE-NEXT: retq
4635 ; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
4636 ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
4637 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
4638 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
4639 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
4640 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4641 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
4642 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4643 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
4644 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
4645 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
4646 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
4647 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
4648 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
4649 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
4650 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4651 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
4652 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
4653 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
4654 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
4655 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
4656 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
4657 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4658 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4659 ; AVX1-NEXT: vzeroupper
4660 ; AVX1-NEXT: retq
4664 ; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
4665 ; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
4666 ; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
4667 ; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
4668 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
4669 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
4670 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
4671 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
4672 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
4673 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4674 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4675 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4676 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4677 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
4678 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4679 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4680 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4681 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4682 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4683 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
4684 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4685 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
4686 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4687 ; AVX2-NEXT: vzeroupper
4688 ; AVX2-NEXT: retq
4692 ; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
4693 ; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
4694 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4695 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
4696 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4697 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4698 ; AVX512-NEXT: retq
4707 ; SSE-NEXT: por %xmm4, %xmm0
4708 ; SSE-NEXT: por %xmm5, %xmm1
4709 ; SSE-NEXT: por %xmm6, %xmm2
4710 ; SSE-NEXT: por %xmm7, %xmm3
4711 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4712 ; SSE-NEXT: pand %xmm4, %xmm3
4713 ; SSE-NEXT: pand %xmm4, %xmm2
4714 ; SSE-NEXT: packuswb %xmm3, %xmm2
4715 ; SSE-NEXT: pand %xmm4, %xmm1
4716 ; SSE-NEXT: pand %xmm4, %xmm0
4717 ; SSE-NEXT: packuswb %xmm1, %xmm0
4718 ; SSE-NEXT: packuswb %xmm2, %xmm0
4719 ; SSE-NEXT: retq
4723 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
4724 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
4725 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4726 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4727 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4728 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
4729 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
4730 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4731 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
4732 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
4733 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4734 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4735 ; AVX1-NEXT: vzeroupper
4736 ; AVX1-NEXT: retq
4740 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
4741 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
4742 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4743 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
4744 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4745 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4746 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
4747 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4748 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4749 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
4750 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4751 ; AVX2-NEXT: vzeroupper
4752 ; AVX2-NEXT: retq
4756 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
4757 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4758 ; AVX512-NEXT: retq
4767 ; SSE-NEXT: por %xmm2, %xmm0
4768 ; SSE-NEXT: por %xmm3, %xmm1
4769 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4770 ; SSE-NEXT: pand %xmm2, %xmm1
4771 ; SSE-NEXT: pand %xmm2, %xmm0
4772 ; SSE-NEXT: packuswb %xmm1, %xmm0
4773 ; SSE-NEXT: retq
4777 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
4778 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4779 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4780 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4781 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4782 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4783 ; AVX1-NEXT: vzeroupper
4784 ; AVX1-NEXT: retq
4788 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
4789 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4790 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4791 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4792 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4793 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4794 ; AVX2-NEXT: vzeroupper
4795 ; AVX2-NEXT: retq
4799 ; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0
4800 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
4801 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4802 ; AVX512F-NEXT: retq
4806 ; AVX512BW-NEXT: vorps %ymm1, %ymm0, %ymm0
4807 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4808 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4809 ; AVX512BW-NEXT: retq
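Here the operation is an or on <16 x i16> truncated to <16 x i8>. Note that the AVX512F path sign-extends to dwords (vpmovsxwd) before vpmovdb, because vpmovwb is an AVX512BW instruction. A sketch (illustrative name):

define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
  ; or the word vectors, then drop each lane to i8
  %1 = or <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}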
4822 ; SSE-NEXT: movl $1, %eax
4823 ; SSE-NEXT: movd %rax, %xmm2
4824 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
4825 ; SSE-NEXT: por %xmm0, %xmm2
4826 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
4827 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4828 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4829 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4830 ; SSE-NEXT: retq
4834 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
4835 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4836 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
4837 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4838 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4839 ; AVX1-NEXT: vzeroupper
4840 ; AVX1-NEXT: retq
4844 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
4845 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4846 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4847 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4848 ; AVX2-NEXT: vzeroupper
4849 ; AVX2-NEXT: retq
4853 ; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
4854 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4855 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4856 ; AVX512-NEXT: retq
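This block ors a <4 x i64> against a constant vector and truncates to <4 x i32>; the movl $1 / movd / pslldq prologue materializes a <0, 1> lane pair in a register instead of loading it. The constant below is an assumption consistent with that prologue, not recovered from the listing:

define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>   ; constant is illustrative
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}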
4865 ; SSE-NEXT: movdqa %xmm0, %xmm4
4866 ; SSE-NEXT: movl $1, %eax
4867 ; SSE-NEXT: movd %rax, %xmm0
4868 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4869 ; SSE-NEXT: por %xmm4, %xmm0
4870 ; SSE-NEXT: por {{.*}}(%rip), %xmm2
4871 ; SSE-NEXT: por {{.*}}(%rip), %xmm3
4872 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
4873 ; SSE-NEXT: pextrw $4, %xmm1, %eax
4874 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4875 ; SSE-NEXT: pextrw $4, %xmm0, %ecx
4876 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4877 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4878 ; SSE-NEXT: pextrw $4, %xmm3, %edx
4879 ; SSE-NEXT: movd %edx, %xmm1
4880 ; SSE-NEXT: movd %eax, %xmm3
4881 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4882 ; SSE-NEXT: movd %ecx, %xmm1
4883 ; SSE-NEXT: pextrw $4, %xmm2, %eax
4884 ; SSE-NEXT: movd %eax, %xmm2
4885 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4886 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4887 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4888 ; SSE-NEXT: retq
4892 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
4893 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
4894 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
4895 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
4896 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4897 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4898 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4899 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
4900 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4901 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4902 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
4903 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4904 ; AVX1-NEXT: vzeroupper
4905 ; AVX1-NEXT: retq
4909 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
4910 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
4911 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4912 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4913 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4914 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4915 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
4916 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4917 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4918 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4919 ; AVX2-NEXT: vzeroupper
4920 ; AVX2-NEXT: retq
4924 ; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
4925 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4926 ; AVX512-NEXT: retq
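The same shape one step wider: or of <8 x i64> with a constant, truncated to <8 x i16>, which AVX512 collapses to a single vporq + vpmovqw. Hypothetical source (name and constant illustrative):

define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>   ; constant is illustrative
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}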
4935 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
4936 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
4937 ; SSE-NEXT: pslld $16, %xmm1
4938 ; SSE-NEXT: psrad $16, %xmm1
4939 ; SSE-NEXT: pslld $16, %xmm0
4940 ; SSE-NEXT: psrad $16, %xmm0
4941 ; SSE-NEXT: packssdw %xmm1, %xmm0
4942 ; SSE-NEXT: retq
4946 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
4947 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4948 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4949 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
4950 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4951 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4952 ; AVX1-NEXT: vzeroupper
4953 ; AVX1-NEXT: retq
4957 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
4958 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4959 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4960 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4961 ; AVX2-NEXT: vzeroupper
4962 ; AVX2-NEXT: retq
4966 ; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
4967 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4968 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4969 ; AVX512-NEXT: retq
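The pslld $16 / psrad $16 / packssdw idiom above is the classic SSE dword-to-word truncation; what is being truncated is an or of <8 x i32> with a constant. Sketch (name and constant illustrative):

define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>   ; constant is illustrative
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}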
4978 ; SSE-NEXT: movl $1, %eax
4979 ; SSE-NEXT: movd %rax, %xmm8
4980 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
4981 ; SSE-NEXT: por %xmm8, %xmm0
4982 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
4983 ; SSE-NEXT: por {{.*}}(%rip), %xmm2
4984 ; SSE-NEXT: por {{.*}}(%rip), %xmm3
4985 ; SSE-NEXT: por {{.*}}(%rip), %xmm4
4986 ; SSE-NEXT: por {{.*}}(%rip), %xmm5
4987 ; SSE-NEXT: por {{.*}}(%rip), %xmm6
4988 ; SSE-NEXT: por {{.*}}(%rip), %xmm7
4989 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4990 ; SSE-NEXT: pand %xmm8, %xmm7
4991 ; SSE-NEXT: pand %xmm8, %xmm6
4992 ; SSE-NEXT: packuswb %xmm7, %xmm6
4993 ; SSE-NEXT: pand %xmm8, %xmm5
4994 ; SSE-NEXT: pand %xmm8, %xmm4
4995 ; SSE-NEXT: packuswb %xmm5, %xmm4
4996 ; SSE-NEXT: packuswb %xmm6, %xmm4
4997 ; SSE-NEXT: pand %xmm8, %xmm3
4998 ; SSE-NEXT: pand %xmm8, %xmm2
4999 ; SSE-NEXT: packuswb %xmm3, %xmm2
5000 ; SSE-NEXT: pand %xmm8, %xmm1
5001 ; SSE-NEXT: pand %xmm8, %xmm0
5002 ; SSE-NEXT: packuswb %xmm1, %xmm0
5003 ; SSE-NEXT: packuswb %xmm2, %xmm0
5004 ; SSE-NEXT: packuswb %xmm4, %xmm0
5005 ; SSE-NEXT: retq
5009 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
5010 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
5011 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
5012 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm3, %ymm3
5013 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
5014 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
5015 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
5016 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
5017 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
5018 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
5019 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
5020 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
5021 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
5022 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
5023 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
5024 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
5025 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
5026 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
5027 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
5028 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
5029 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
5030 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
5031 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5032 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
5033 ; AVX1-NEXT: vzeroupper
5034 ; AVX1-NEXT: retq
5038 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
5039 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
5040 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3
5041 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm2, %ymm2
5042 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
5043 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
5044 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
5045 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
5046 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
5047 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
5048 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
5049 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
5050 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
5051 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
5052 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
5053 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
5054 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
5055 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
5056 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
5057 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
5058 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5059 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5060 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
5061 ; AVX2-NEXT: vzeroupper
5062 ; AVX2-NEXT: retq
5066 ; AVX512-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
5067 ; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
5068 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
5069 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
5070 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5071 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
5072 ; AVX512-NEXT: retq
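This is the constant-operand variant of the <16 x i64> test: eight xmm ors on SSE versus two vporq on AVX512, followed by the same qword-to-byte truncation. Hypothetical source (name and constant illustrative):

define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
  ; constant is illustrative
  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}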
5081 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5082 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
5083 ; SSE-NEXT: por {{.*}}(%rip), %xmm2
5084 ; SSE-NEXT: por {{.*}}(%rip), %xmm3
5085 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
5086 ; SSE-NEXT: pand %xmm4, %xmm3
5087 ; SSE-NEXT: pand %xmm4, %xmm2
5088 ; SSE-NEXT: packuswb %xmm3, %xmm2
5089 ; SSE-NEXT: pand %xmm4, %xmm1
5090 ; SSE-NEXT: pand %xmm4, %xmm0
5091 ; SSE-NEXT: packuswb %xmm1, %xmm0
5092 ; SSE-NEXT: packuswb %xmm2, %xmm0
5093 ; SSE-NEXT: retq
5097 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
5098 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
5099 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
5100 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
5101 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
5102 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
5103 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
5104 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
5105 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
5106 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
5107 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
5108 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
5109 ; AVX1-NEXT: vzeroupper
5110 ; AVX1-NEXT: retq
5114 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
5115 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
5116 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
5117 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
5118 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5119 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
5120 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
5121 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5122 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5123 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
5124 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5125 ; AVX2-NEXT: vzeroupper
5126 ; AVX2-NEXT: retq
5130 ; AVX512-NEXT: vpord {{.*}}(%rip), %zmm0, %zmm0
5131 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
5132 ; AVX512-NEXT: retq
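Likewise for dword elements: or of <16 x i32> with a constant, truncated to <16 x i8>, where AVX512 needs only vpord + vpmovdb. Sketch (name and constant illustrative):

define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
  ; constant is illustrative
  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}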
5141 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5142 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
5143 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
5144 ; SSE-NEXT: pand %xmm2, %xmm1
5145 ; SSE-NEXT: pand %xmm2, %xmm0
5146 ; SSE-NEXT: packuswb %xmm1, %xmm0
5147 ; SSE-NEXT: retq
5151 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
5152 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
5153 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
5154 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5155 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5156 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5157 ; AVX1-NEXT: vzeroupper
5158 ; AVX1-NEXT: retq
5162 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
5163 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5164 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
5165 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5166 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5167 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5168 ; AVX2-NEXT: vzeroupper
5169 ; AVX2-NEXT: retq
5173 ; AVX512F-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
5174 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
5175 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
5176 ; AVX512F-NEXT: retq
5180 ; AVX512BW-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
5181 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
5182 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5183 ; AVX512BW-NEXT: retq
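And the word-element variant: or of <16 x i16> with a constant, truncated to <16 x i8>. Sketch (name and constant illustrative):

define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
  ; constant is illustrative
  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}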
5196 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
5197 ; SSE-NEXT: movdqa %xmm2, %xmm3
5198 ; SSE-NEXT: psrad $31, %xmm3
5199 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
5200 ; SSE-NEXT: movdqa %xmm0, %xmm3
5201 ; SSE-NEXT: psrad $31, %xmm3
5202 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
5203 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
5204 ; SSE-NEXT: movdqa %xmm3, %xmm4
5205 ; SSE-NEXT: psrad $31, %xmm4
5206 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
5207 ; SSE-NEXT: movdqa %xmm1, %xmm4
5208 ; SSE-NEXT: psrad $31, %xmm4
5209 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
5210 ; SSE-NEXT: movdqa %xmm0, %xmm4
5211 ; SSE-NEXT: pmuludq %xmm1, %xmm4
5212 ; SSE-NEXT: movdqa %xmm1, %xmm5
5213 ; SSE-NEXT: psrlq $32, %xmm5
5214 ; SSE-NEXT: pmuludq %xmm0, %xmm5
5215 ; SSE-NEXT: psllq $32, %xmm5
5216 ; SSE-NEXT: paddq %xmm4, %xmm5
5217 ; SSE-NEXT: psrlq $32, %xmm0
5218 ; SSE-NEXT: pmuludq %xmm1, %xmm0
5219 ; SSE-NEXT: psllq $32, %xmm0
5220 ; SSE-NEXT: paddq %xmm5, %xmm0
5221 ; SSE-NEXT: movdqa %xmm2, %xmm1
5222 ; SSE-NEXT: pmuludq %xmm3, %xmm1
5223 ; SSE-NEXT: movdqa %xmm3, %xmm4
5224 ; SSE-NEXT: psrlq $32, %xmm4
5225 ; SSE-NEXT: pmuludq %xmm2, %xmm4
5226 ; SSE-NEXT: psllq $32, %xmm4
5227 ; SSE-NEXT: paddq %xmm1, %xmm4
5228 ; SSE-NEXT: psrlq $32, %xmm2
5229 ; SSE-NEXT: pmuludq %xmm3, %xmm2
5230 ; SSE-NEXT: psllq $32, %xmm2
5231 ; SSE-NEXT: paddq %xmm4, %xmm2
5232 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
5233 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm0
5234 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5235 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
5236 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5237 ; SSE-NEXT: retq
5241 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
5242 ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
5243 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
5244 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
5245 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
5246 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
5247 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm4
5248 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
5249 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
5250 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
5251 ; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
5252 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
5253 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
5254 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
5255 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
5256 ; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm1
5257 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
5258 ; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
5259 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
5260 ; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
5261 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
5262 ; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
5263 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
5264 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
5265 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
5266 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
5267 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5268 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
5269 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
5270 ; AVX1-NEXT: retq
5274 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
5275 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
5276 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
5277 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
5278 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
5279 ; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
5280 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
5281 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
5282 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
5283 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
5284 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
5285 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
5286 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
5287 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
5288 ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5289 ; AVX2-NEXT: vzeroupper
5290 ; AVX2-NEXT: retq
5294 ; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
5295 ; AVX512-NEXT: vpmovsxdq %xmm1, %ymm1
5296 ; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
5297 ; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
5298 ; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
5299 ; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
5300 ; AVX512-NEXT: vpaddq %ymm3, %ymm2, %ymm2
5301 ; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
5302 ; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
5303 ; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
5304 ; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
5305 ; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
5306 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
5307 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5308 ; AVX512-NEXT: retq
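This final block is a widening multiply-add: psrad $31 + punpckldq is a manual sign extension of each i32 lane to i64, and the pmuludq / psrlq $32 / psllq $32 / paddq chain synthesizes a 64-bit multiply from 32x32-bit partial products, after which a constant is added and the result truncated back to <4 x i32>. A sketch of the likely shape (the added constant is an assumption):

define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
  ; widen both operands, multiply and add in 64 bits, then narrow
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, <i64 0, i64 1, i64 2, i64 3>   ; constant is illustrative
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}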