Lines Matching full:next
10 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
11 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
12 ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
13 ; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3
14 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
15 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
16 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
17 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
18 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
19 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
20 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
21 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
22 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
23 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
24 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
25 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
26 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
27 ; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
28 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
29 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
30 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
31 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
32 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
33 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
34 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
35 ; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
36 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
37 ; AVX1-NEXT: retq
41 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
42 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
43 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
44 ; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
45 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
46 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
47 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
48 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
49 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
50 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
51 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
52 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
53 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
54 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
55 ; AVX2-NEXT: retq
59 ; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
60 ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm2
61 ; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
62 ; AVX512CDVL-NEXT: vpsubq {{.*}}(%rip){1to4}, %ymm0, %ymm0
63 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
64 ; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm3
65 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
66 ; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
67 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
68 ; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
69 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
70 ; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
71 ; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
72 ; AVX512CDVL-NEXT: retq
76 ; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
77 ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm2
78 ; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
79 ; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
80 ; AVX512CD-NEXT: vpsubq %ymm2, %ymm0, %ymm0
81 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
82 ; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
83 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
84 ; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
85 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
86 ; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
87 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
88 ; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
89 ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
90 ; AVX512CD-NEXT: retq
98 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
99 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
100 ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
101 ; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3
102 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
103 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
104 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
105 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
106 ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
107 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
108 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
109 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
110 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
111 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
112 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
113 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
114 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
115 ; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
116 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
117 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
118 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
119 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
120 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
121 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
122 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
123 ; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
124 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
125 ; AVX1-NEXT: retq
129 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
130 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
131 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
132 ; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
133 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
134 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
135 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
136 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
137 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
138 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
139 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
140 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
141 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
142 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
143 ; AVX2-NEXT: retq
147 ; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
148 ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
149 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
150 ; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0
151 ; AVX512CDVL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
152 ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0
153 ; AVX512CDVL-NEXT: retq
157 ; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
158 ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1
159 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
160 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
161 ; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
162 ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0
163 ; AVX512CD-NEXT: retq
171 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
172 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
173 ; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2
174 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3
175 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
176 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
177 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
178 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
179 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
180 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
181 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
182 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
183 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
184 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
185 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
186 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
187 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
188 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
189 ; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
190 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
191 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
192 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
193 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
194 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
195 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
196 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
197 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
198 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
199 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
200 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
201 ; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
202 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
203 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
204 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
205 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
206 ; AVX1-NEXT: retq
210 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
211 ; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
212 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
213 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
214 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
215 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
216 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
217 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
218 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
219 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
220 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
221 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
222 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
223 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
224 ; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
225 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
226 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
227 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
228 ; AVX2-NEXT: retq
232 ; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
233 ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm2
234 ; AVX512CDVL-NEXT: vpandd %ymm2, %ymm0, %ymm0
235 ; AVX512CDVL-NEXT: vpsubd {{.*}}(%rip){1to8}, %ymm0, %ymm0
236 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
237 ; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm3
238 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
239 ; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
240 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
241 ; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
242 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
243 ; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
244 ; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
245 ; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
246 ; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
247 ; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
248 ; AVX512CDVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
249 ; AVX512CDVL-NEXT: retq
253 ; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
254 ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm2
255 ; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
256 ; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
257 ; AVX512CD-NEXT: vpsubd %ymm2, %ymm0, %ymm0
258 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
259 ; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
260 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
261 ; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
262 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
263 ; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
264 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
265 ; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
266 ; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
267 ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
268 ; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
269 ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
270 ; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
271 ; AVX512CD-NEXT: retq
279 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
280 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
281 ; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2
282 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3
283 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
284 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
285 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
286 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
287 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
288 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
289 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
290 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
291 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
292 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
293 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
294 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
295 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
296 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
297 ; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
298 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
299 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
300 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
301 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
302 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
303 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
304 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
305 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
306 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
307 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
308 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
309 ; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
310 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
311 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
312 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
313 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
314 ; AVX1-NEXT: retq
318 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
319 ; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
320 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
321 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
322 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
323 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
324 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
325 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
326 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
327 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
328 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
329 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
330 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
331 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
332 ; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
333 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
334 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
335 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
336 ; AVX2-NEXT: retq
340 ; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
341 ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
342 ; AVX512CDVL-NEXT: vpandd %ymm1, %ymm0, %ymm0
343 ; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0
344 ; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
345 ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0
346 ; AVX512CDVL-NEXT: retq
350 ; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
351 ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1
352 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
353 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
354 ; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
355 ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0
356 ; AVX512CD-NEXT: retq
364 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
365 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
366 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
367 ; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2
368 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
369 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
370 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
371 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
372 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
373 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
374 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
375 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
376 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
377 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
378 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
379 ; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
380 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4
381 ; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
382 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
383 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
384 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
385 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1
386 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
387 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
388 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
389 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
390 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
391 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
392 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
393 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
394 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
395 ; AVX1-NEXT: retq
399 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
400 ; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
401 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
402 ; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
403 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
404 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
405 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
406 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
407 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
408 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
409 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
410 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
411 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
412 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
413 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
414 ; AVX2-NEXT: retq
418 ; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
419 ; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
420 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
421 ; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
422 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
423 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
424 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
425 ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
426 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
427 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
428 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
429 ; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
430 ; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
431 ; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
432 ; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
433 ; AVX512CDVL-NEXT: retq
437 ; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
438 ; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
439 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
440 ; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
441 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
442 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
443 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
444 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
445 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
446 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
447 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
448 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
449 ; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
450 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
451 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
452 ; AVX512CD-NEXT: retq
460 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
461 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
462 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
463 ; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2
464 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
465 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
466 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
467 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
468 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
469 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
470 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
471 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
472 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
473 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
474 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
475 ; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
476 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4
477 ; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
478 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
479 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
480 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
481 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1
482 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
483 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
484 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
485 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
486 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
487 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
488 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
489 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
490 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
491 ; AVX1-NEXT: retq
495 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
496 ; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
497 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
498 ; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
499 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
500 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
501 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
502 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
503 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
504 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
505 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
506 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
507 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
508 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
509 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
510 ; AVX2-NEXT: retq
514 ; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
515 ; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
516 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
517 ; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
518 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
519 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
520 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
521 ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
522 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
523 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
524 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
525 ; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
526 ; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
527 ; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
528 ; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
529 ; AVX512CDVL-NEXT: retq
533 ; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
534 ; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
535 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
536 ; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
537 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
538 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
539 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
540 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
541 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
542 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
543 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
544 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
545 ; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
546 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
547 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
548 ; AVX512CD-NEXT: retq
556 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
557 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
558 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
559 ; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
560 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
561 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
562 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
563 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
564 ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
565 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
566 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
567 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
568 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
569 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
570 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
571 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
572 ; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
573 ; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
574 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
575 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
576 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
577 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
578 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
579 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
580 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
581 ; AVX1-NEXT: retq
585 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
586 ; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
587 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
588 ; AVX2-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
589 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
590 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
591 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
592 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
593 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
594 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
595 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
596 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
597 ; AVX2-NEXT: retq
601 ; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
602 ; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
603 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
604 ; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
605 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
606 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
607 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
608 ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
609 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
610 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
611 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
612 ; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
613 ; AVX512CDVL-NEXT: retq
617 ; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
618 ; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
619 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
620 ; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
621 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
622 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
623 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
624 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
625 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
626 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
627 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
628 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
629 ; AVX512CD-NEXT: retq
637 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
638 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
639 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
640 ; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
641 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
642 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
643 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
644 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
645 ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
646 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
647 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
648 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
649 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
650 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
651 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
652 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
653 ; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
654 ; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
655 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
656 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
657 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
658 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
659 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
660 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
661 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
662 ; AVX1-NEXT: retq
666 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
667 ; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
668 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
669 ; AVX2-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
670 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
671 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
672 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
673 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
674 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
675 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
676 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
677 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
678 ; AVX2-NEXT: retq
682 ; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
683 ; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
684 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
685 ; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
686 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
687 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
688 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
689 ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
690 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
691 ; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
692 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
693 ; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
694 ; AVX512CDVL-NEXT: retq
698 ; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
699 ; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
700 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
701 ; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
702 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
703 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
704 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
705 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
706 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
707 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
708 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
709 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
710 ; AVX512CD-NEXT: retq
718 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
719 ; AVX1-NEXT: retq
723 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
724 ; AVX2-NEXT: retq
728 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
729 ; AVX512CDVL-NEXT: retq
733 ; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
734 ; AVX512CD-NEXT: retq
742 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
743 ; AVX1-NEXT: retq
747 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
748 ; AVX2-NEXT: retq
752 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
753 ; AVX512CDVL-NEXT: retq
757 ; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
758 ; AVX512CD-NEXT: retq
766 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
767 ; AVX1-NEXT: retq
771 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
772 ; AVX2-NEXT: retq
776 ; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
777 ; AVX512CDVL-NEXT: retq
781 ; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
782 ; AVX512CD-NEXT: retq
790 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
791 ; AVX1-NEXT: retq
795 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
796 ; AVX2-NEXT: retq
800 ; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
801 ; AVX512CDVL-NEXT: retq
805 ; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
806 ; AVX512CD-NEXT: retq
814 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
815 ; AVX1-NEXT: retq
819 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
820 ; AVX2-NEXT: retq
824 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
825 ; AVX512CDVL-NEXT: retq
829 ; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
830 ; AVX512CD-NEXT: retq
838 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
839 ; AVX1-NEXT: retq
843 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
844 ; AVX2-NEXT: retq
848 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
849 ; AVX512CDVL-NEXT: retq
853 ; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
854 ; AVX512CD-NEXT: retq
862 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
863 ; AVX1-NEXT: retq
867 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
868 ; AVX2-NEXT: retq
872 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
873 ; AVX512CDVL-NEXT: retq
877 ; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
878 ; AVX512CD-NEXT: retq
886 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
887 ; AVX1-NEXT: retq
891 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
892 ; AVX2-NEXT: retq
896 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
897 ; AVX512CDVL-NEXT: retq
901 ; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
902 ; AVX512CD-NEXT: retq