; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL

;
; vXi64
;

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v2i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v2i64:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a0)
  ret i64 %1
}

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v4i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v4i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v4i64:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm0, %xmm4
; SSE-NEXT: paddq %xmm3, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v8i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v8i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v8i64:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> %a0)
  ret i64 %1
}

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm2, %xmm8
; SSE-NEXT: psrlq $32, %xmm8
; SSE-NEXT: pmuludq %xmm6, %xmm8
; SSE-NEXT: movdqa %xmm6, %xmm9
; SSE-NEXT: psrlq $32, %xmm9
; SSE-NEXT: pmuludq %xmm2, %xmm9
; SSE-NEXT: paddq %xmm8, %xmm9
; SSE-NEXT: psllq $32, %xmm9
; SSE-NEXT: pmuludq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm9, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: psrlq $32, %xmm8
; SSE-NEXT: pmuludq %xmm4, %xmm8
; SSE-NEXT: movdqa %xmm4, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm0, %xmm6
; SSE-NEXT: paddq %xmm8, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: pmuludq %xmm4, %xmm0
; SSE-NEXT: paddq %xmm6, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm7, %xmm4
; SSE-NEXT: movdqa %xmm7, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm3, %xmm6
; SSE-NEXT: paddq %xmm4, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: pmuludq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm5, %xmm4
; SSE-NEXT: movdqa %xmm5, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm1, %xmm6
; SSE-NEXT: paddq %xmm4, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: pmuludq %xmm5, %xmm1
; SSE-NEXT: paddq %xmm6, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm0, %xmm4
; SSE-NEXT: paddq %xmm3, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5
; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm6
; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6
; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm7
; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2
; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2
; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4
; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4
; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v16i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_v16i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512DQ-LABEL: test_v16i64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovq %xmm0, %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQVL-LABEL: test_v16i64:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vmovq %xmm0, %rax
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
  %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE2-LABEL: test_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a0)
  ret i32 %1
}

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-LABEL: test_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-LABEL: test_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd
{{.*#+}} xmm1 = xmm1[1,1,3,3] 925 ; SSE2-NEXT: pmuludq %xmm2, %xmm1 926 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 927 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 928 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 929 ; SSE2-NEXT: pmuludq %xmm0, %xmm1 930 ; SSE2-NEXT: movd %xmm1, %eax 931 ; SSE2-NEXT: retq 932 ; 933 ; SSE41-LABEL: test_v16i32: 934 ; SSE41: # %bb.0: 935 ; SSE41-NEXT: pmulld %xmm3, %xmm1 936 ; SSE41-NEXT: pmulld %xmm2, %xmm0 937 ; SSE41-NEXT: pmulld %xmm1, %xmm0 938 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 939 ; SSE41-NEXT: pmulld %xmm0, %xmm1 940 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] 941 ; SSE41-NEXT: pmulld %xmm1, %xmm0 942 ; SSE41-NEXT: movd %xmm0, %eax 943 ; SSE41-NEXT: retq 944 ; 945 ; AVX1-LABEL: test_v16i32: 946 ; AVX1: # %bb.0: 947 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 948 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 949 ; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 950 ; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 951 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 952 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 953 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 954 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 955 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 956 ; AVX1-NEXT: vmovd %xmm0, %eax 957 ; AVX1-NEXT: vzeroupper 958 ; AVX1-NEXT: retq 959 ; 960 ; AVX2-LABEL: test_v16i32: 961 ; AVX2: # %bb.0: 962 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 963 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 964 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 965 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 966 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 967 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 968 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 969 ; AVX2-NEXT: vmovd %xmm0, %eax 970 ; AVX2-NEXT: vzeroupper 971 ; AVX2-NEXT: retq 972 ; 973 ; AVX512-LABEL: test_v16i32: 974 ; AVX512: # %bb.0: 975 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 976 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 977 ; AVX512-NEXT: 
vextracti128 $1, %ymm0, %xmm1 978 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 979 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 980 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 981 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 982 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 983 ; AVX512-NEXT: vmovd %xmm0, %eax 984 ; AVX512-NEXT: vzeroupper 985 ; AVX512-NEXT: retq 986 %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> %a0) 987 ret i32 %1 988 } 989 990 define i32 @test_v32i32(<32 x i32> %a0) { 991 ; SSE2-LABEL: test_v32i32: 992 ; SSE2: # %bb.0: 993 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] 994 ; SSE2-NEXT: pmuludq %xmm6, %xmm2 995 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 996 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 997 ; SSE2-NEXT: pmuludq %xmm8, %xmm6 998 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 999 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] 1000 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] 1001 ; SSE2-NEXT: pmuludq %xmm4, %xmm0 1002 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1003 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1004 ; SSE2-NEXT: pmuludq %xmm6, %xmm4 1005 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1006 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 1007 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 1008 ; SSE2-NEXT: pmuludq %xmm7, %xmm3 1009 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1010 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] 1011 ; SSE2-NEXT: pmuludq %xmm4, %xmm6 1012 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] 1013 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 1014 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 1015 ; SSE2-NEXT: pmuludq %xmm5, %xmm1 1016 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1017 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1018 ; SSE2-NEXT: pmuludq %xmm4, %xmm5 1019 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1020 
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 1021 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 1022 ; SSE2-NEXT: pmuludq %xmm3, %xmm1 1023 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1024 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1025 ; SSE2-NEXT: pmuludq %xmm4, %xmm3 1026 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1027 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 1028 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1029 ; SSE2-NEXT: pmuludq %xmm2, %xmm0 1030 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1031 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1032 ; SSE2-NEXT: pmuludq %xmm3, %xmm2 1033 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1034 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1035 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1036 ; SSE2-NEXT: pmuludq %xmm1, %xmm0 1037 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1038 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1039 ; SSE2-NEXT: pmuludq %xmm2, %xmm1 1040 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1041 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1042 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1043 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1044 ; SSE2-NEXT: pmuludq %xmm1, %xmm0 1045 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1046 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1047 ; SSE2-NEXT: pmuludq %xmm2, %xmm1 1048 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1049 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1050 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1051 ; SSE2-NEXT: pmuludq %xmm0, %xmm1 1052 ; SSE2-NEXT: movd %xmm1, %eax 1053 ; SSE2-NEXT: retq 1054 ; 1055 ; SSE41-LABEL: test_v32i32: 1056 ; SSE41: # %bb.0: 1057 ; SSE41-NEXT: pmulld %xmm6, %xmm2 1058 ; SSE41-NEXT: pmulld %xmm4, %xmm0 1059 ; SSE41-NEXT: pmulld %xmm2, %xmm0 1060 ; SSE41-NEXT: pmulld %xmm7, %xmm3 1061 ; 
SSE41-NEXT: pmulld %xmm5, %xmm1 1062 ; SSE41-NEXT: pmulld %xmm3, %xmm1 1063 ; SSE41-NEXT: pmulld %xmm0, %xmm1 1064 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 1065 ; SSE41-NEXT: pmulld %xmm1, %xmm0 1066 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1067 ; SSE41-NEXT: pmulld %xmm0, %xmm1 1068 ; SSE41-NEXT: movd %xmm1, %eax 1069 ; SSE41-NEXT: retq 1070 ; 1071 ; AVX1-LABEL: test_v32i32: 1072 ; AVX1: # %bb.0: 1073 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm4 1074 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1075 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1076 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 1077 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1078 ; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 1079 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1080 ; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 1081 ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 1082 ; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 1083 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1084 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1085 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1086 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1087 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1088 ; AVX1-NEXT: vmovd %xmm0, %eax 1089 ; AVX1-NEXT: vzeroupper 1090 ; AVX1-NEXT: retq 1091 ; 1092 ; AVX2-LABEL: test_v32i32: 1093 ; AVX2: # %bb.0: 1094 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 1095 ; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 1096 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1097 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1098 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1099 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1100 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1101 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1102 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1103 ; AVX2-NEXT: vmovd %xmm0, %eax 1104 ; AVX2-NEXT: vzeroupper 1105 ; AVX2-NEXT: retq 1106 ; 1107 ; AVX512-LABEL: test_v32i32: 1108 ; AVX512: # %bb.0: 1109 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 1110 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1111 ; AVX512-NEXT: 
vpmulld %zmm1, %zmm0, %zmm0 1112 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1113 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 1114 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1115 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 1116 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1117 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 1118 ; AVX512-NEXT: vmovd %xmm0, %eax 1119 ; AVX512-NEXT: vzeroupper 1120 ; AVX512-NEXT: retq 1121 %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> %a0) 1122 ret i32 %1 1123 } 1124 1125 ; 1126 ; vXi16 1127 ; 1128 1129 define i16 @test_v8i16(<8 x i16> %a0) { 1130 ; SSE-LABEL: test_v8i16: 1131 ; SSE: # %bb.0: 1132 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1133 ; SSE-NEXT: pmullw %xmm0, %xmm1 1134 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] 1135 ; SSE-NEXT: pmullw %xmm1, %xmm0 1136 ; SSE-NEXT: movdqa %xmm0, %xmm1 1137 ; SSE-NEXT: psrld $16, %xmm1 1138 ; SSE-NEXT: pmullw %xmm0, %xmm1 1139 ; SSE-NEXT: movd %xmm1, %eax 1140 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax 1141 ; SSE-NEXT: retq 1142 ; 1143 ; AVX-LABEL: test_v8i16: 1144 ; AVX: # %bb.0: 1145 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1146 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1147 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1148 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1149 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 1150 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1151 ; AVX-NEXT: vmovd %xmm0, %eax 1152 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax 1153 ; AVX-NEXT: retq 1154 ; 1155 ; AVX512-LABEL: test_v8i16: 1156 ; AVX512: # %bb.0: 1157 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1158 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1159 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1160 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1161 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 1162 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1163 ; AVX512-NEXT: vmovd %xmm0, %eax 1164 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 
1165 ; AVX512-NEXT: retq 1166 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> %a0) 1167 ret i16 %1 1168 } 1169 1170 define i16 @test_v16i16(<16 x i16> %a0) { 1171 ; SSE-LABEL: test_v16i16: 1172 ; SSE: # %bb.0: 1173 ; SSE-NEXT: pmullw %xmm1, %xmm0 1174 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1175 ; SSE-NEXT: pmullw %xmm0, %xmm1 1176 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] 1177 ; SSE-NEXT: pmullw %xmm1, %xmm0 1178 ; SSE-NEXT: movdqa %xmm0, %xmm1 1179 ; SSE-NEXT: psrld $16, %xmm1 1180 ; SSE-NEXT: pmullw %xmm0, %xmm1 1181 ; SSE-NEXT: movd %xmm1, %eax 1182 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax 1183 ; SSE-NEXT: retq 1184 ; 1185 ; AVX1-LABEL: test_v16i16: 1186 ; AVX1: # %bb.0: 1187 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1188 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1189 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1190 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1191 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1192 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1193 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 1194 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1195 ; AVX1-NEXT: vmovd %xmm0, %eax 1196 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax 1197 ; AVX1-NEXT: vzeroupper 1198 ; AVX1-NEXT: retq 1199 ; 1200 ; AVX2-LABEL: test_v16i16: 1201 ; AVX2: # %bb.0: 1202 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1203 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1204 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1205 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1206 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1207 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1208 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 1209 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1210 ; AVX2-NEXT: vmovd %xmm0, %eax 1211 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax 1212 ; AVX2-NEXT: vzeroupper 1213 ; AVX2-NEXT: retq 1214 ; 1215 ; AVX512-LABEL: test_v16i16: 1216 ; AVX512: # %bb.0: 1217 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1218 ; AVX512-NEXT: 
vpmullw %ymm1, %ymm0, %ymm0 1219 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1220 ; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1221 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1222 ; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1223 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 1224 ; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1225 ; AVX512-NEXT: vmovd %xmm0, %eax 1226 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 1227 ; AVX512-NEXT: vzeroupper 1228 ; AVX512-NEXT: retq 1229 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> %a0) 1230 ret i16 %1 1231 } 1232 1233 define i16 @test_v32i16(<32 x i16> %a0) { 1234 ; SSE-LABEL: test_v32i16: 1235 ; SSE: # %bb.0: 1236 ; SSE-NEXT: pmullw %xmm3, %xmm1 1237 ; SSE-NEXT: pmullw %xmm2, %xmm0 1238 ; SSE-NEXT: pmullw %xmm1, %xmm0 1239 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1240 ; SSE-NEXT: pmullw %xmm0, %xmm1 1241 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] 1242 ; SSE-NEXT: pmullw %xmm1, %xmm0 1243 ; SSE-NEXT: movdqa %xmm0, %xmm1 1244 ; SSE-NEXT: psrld $16, %xmm1 1245 ; SSE-NEXT: pmullw %xmm0, %xmm1 1246 ; SSE-NEXT: movd %xmm1, %eax 1247 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax 1248 ; SSE-NEXT: retq 1249 ; 1250 ; AVX1-LABEL: test_v32i16: 1251 ; AVX1: # %bb.0: 1252 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1253 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1254 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 1255 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 1256 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1257 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1258 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1259 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1260 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1261 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 1262 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1263 ; AVX1-NEXT: vmovd %xmm0, %eax 1264 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax 1265 ; AVX1-NEXT: vzeroupper 1266 ; AVX1-NEXT: retq 1267 ; 1268 ; AVX2-LABEL: test_v32i16: 1269 ; AVX2: # 
%bb.0: 1270 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1271 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1272 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1273 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1274 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1275 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1276 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1277 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 1278 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1279 ; AVX2-NEXT: vmovd %xmm0, %eax 1280 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax 1281 ; AVX2-NEXT: vzeroupper 1282 ; AVX2-NEXT: retq 1283 ; 1284 ; AVX512BW-LABEL: test_v32i16: 1285 ; AVX512BW: # %bb.0: 1286 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1287 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1288 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1289 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1290 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1291 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1292 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1293 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1294 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 1295 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1296 ; AVX512BW-NEXT: vmovd %xmm0, %eax 1297 ; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax 1298 ; AVX512BW-NEXT: vzeroupper 1299 ; AVX512BW-NEXT: retq 1300 ; 1301 ; AVX512BWVL-LABEL: test_v32i16: 1302 ; AVX512BWVL: # %bb.0: 1303 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1304 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1305 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 1306 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1307 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1308 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1309 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1310 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1311 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 1312 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1313 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax 1314 
; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax 1315 ; AVX512BWVL-NEXT: vzeroupper 1316 ; AVX512BWVL-NEXT: retq 1317 ; 1318 ; AVX512DQ-LABEL: test_v32i16: 1319 ; AVX512DQ: # %bb.0: 1320 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1321 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 1322 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1323 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1324 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1325 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1326 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1327 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 1328 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1329 ; AVX512DQ-NEXT: vmovd %xmm0, %eax 1330 ; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax 1331 ; AVX512DQ-NEXT: vzeroupper 1332 ; AVX512DQ-NEXT: retq 1333 ; 1334 ; AVX512DQVL-LABEL: test_v32i16: 1335 ; AVX512DQVL: # %bb.0: 1336 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1337 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 1338 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1339 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1340 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1341 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1342 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1343 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 1344 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1345 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax 1346 ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax 1347 ; AVX512DQVL-NEXT: vzeroupper 1348 ; AVX512DQVL-NEXT: retq 1349 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> %a0) 1350 ret i16 %1 1351 } 1352 1353 define i16 @test_v64i16(<64 x i16> %a0) { 1354 ; SSE-LABEL: test_v64i16: 1355 ; SSE: # %bb.0: 1356 ; SSE-NEXT: pmullw %xmm6, %xmm2 1357 ; SSE-NEXT: pmullw %xmm4, %xmm0 1358 ; SSE-NEXT: pmullw %xmm2, %xmm0 1359 ; SSE-NEXT: pmullw %xmm7, %xmm3 1360 ; SSE-NEXT: pmullw %xmm5, %xmm1 1361 ; SSE-NEXT: pmullw %xmm3, %xmm1 1362 ; SSE-NEXT: 
pmullw %xmm0, %xmm1 1363 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 1364 ; SSE-NEXT: pmullw %xmm1, %xmm0 1365 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1366 ; SSE-NEXT: pmullw %xmm0, %xmm1 1367 ; SSE-NEXT: movdqa %xmm1, %xmm0 1368 ; SSE-NEXT: psrld $16, %xmm0 1369 ; SSE-NEXT: pmullw %xmm1, %xmm0 1370 ; SSE-NEXT: movd %xmm0, %eax 1371 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax 1372 ; SSE-NEXT: retq 1373 ; 1374 ; AVX1-LABEL: test_v64i16: 1375 ; AVX1: # %bb.0: 1376 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm4 1377 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1378 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1379 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 1380 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 1381 ; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 1382 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1383 ; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 1384 ; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 1385 ; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 1386 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1387 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1388 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1389 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1390 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1391 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 1392 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1393 ; AVX1-NEXT: vmovd %xmm0, %eax 1394 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax 1395 ; AVX1-NEXT: vzeroupper 1396 ; AVX1-NEXT: retq 1397 ; 1398 ; AVX2-LABEL: test_v64i16: 1399 ; AVX2: # %bb.0: 1400 ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 1401 ; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 1402 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1403 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1404 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1405 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1406 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1407 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1408 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1409 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 1410 ; AVX2-NEXT: 
vpmullw %ymm1, %ymm0, %ymm0 1411 ; AVX2-NEXT: vmovd %xmm0, %eax 1412 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax 1413 ; AVX2-NEXT: vzeroupper 1414 ; AVX2-NEXT: retq 1415 ; 1416 ; AVX512BW-LABEL: test_v64i16: 1417 ; AVX512BW: # %bb.0: 1418 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1419 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1420 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1421 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1422 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1423 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1424 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1425 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1426 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1427 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 1428 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1429 ; AVX512BW-NEXT: vmovd %xmm0, %eax 1430 ; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax 1431 ; AVX512BW-NEXT: vzeroupper 1432 ; AVX512BW-NEXT: retq 1433 ; 1434 ; AVX512BWVL-LABEL: test_v64i16: 1435 ; AVX512BWVL: # %bb.0: 1436 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1437 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 1438 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1439 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 1440 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1441 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1442 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1443 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1444 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1445 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 1446 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1447 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax 1448 ; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax 1449 ; AVX512BWVL-NEXT: vzeroupper 1450 ; AVX512BWVL-NEXT: retq 1451 ; 1452 ; AVX512DQ-LABEL: test_v64i16: 1453 ; AVX512DQ: # %bb.0: 1454 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1 1455 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1 1456 ; 
AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1457 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 1458 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1459 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1460 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1461 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1462 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1463 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 1464 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1465 ; AVX512DQ-NEXT: vmovd %xmm0, %eax 1466 ; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax 1467 ; AVX512DQ-NEXT: vzeroupper 1468 ; AVX512DQ-NEXT: retq 1469 ; 1470 ; AVX512DQVL-LABEL: test_v64i16: 1471 ; AVX512DQVL: # %bb.0: 1472 ; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 1473 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1 1474 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1475 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 1476 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1477 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1478 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1479 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1480 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1481 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 1482 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1483 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax 1484 ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax 1485 ; AVX512DQVL-NEXT: vzeroupper 1486 ; AVX512DQVL-NEXT: retq 1487 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> %a0) 1488 ret i16 %1 1489 } 1490 1491 ; 1492 ; vXi8 1493 ; 1494 1495 define i8 @test_v16i8(<16 x i8> %a0) { 1496 ; SSE2-LABEL: test_v16i8: 1497 ; SSE2: # %bb.0: 1498 ; SSE2-NEXT: movdqa %xmm0, %xmm1 1499 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1500 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 
1501 ; SSE2-NEXT: pmullw %xmm1, %xmm0 1502 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 1503 ; SSE2-NEXT: pand %xmm1, %xmm0 1504 ; SSE2-NEXT: pxor %xmm3, %xmm3 1505 ; SSE2-NEXT: packuswb %xmm3, %xmm0 1506 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3] 1507 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1508 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1509 ; SSE2-NEXT: pmullw %xmm0, %xmm2 1510 ; SSE2-NEXT: pand %xmm1, %xmm2 1511 ; SSE2-NEXT: packuswb %xmm3, %xmm2 1512 ; SSE2-NEXT: movdqa %xmm2, %xmm0 1513 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1514 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1515 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1516 ; SSE2-NEXT: pmullw %xmm0, %xmm2 1517 ; SSE2-NEXT: pand %xmm1, %xmm2 1518 ; SSE2-NEXT: packuswb %xmm3, %xmm2 1519 ; SSE2-NEXT: movdqa %xmm2, %xmm0 1520 ; SSE2-NEXT: psrlw $8, %xmm0 1521 ; SSE2-NEXT: movdqa %xmm2, %xmm3 1522 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 1523 ; SSE2-NEXT: pmullw %xmm0, %xmm3 1524 ; SSE2-NEXT: pand %xmm1, %xmm3 1525 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 1526 ; SSE2-NEXT: pmullw %xmm0, %xmm2 1527 ; SSE2-NEXT: pand %xmm1, %xmm2 1528 ; SSE2-NEXT: packuswb %xmm3, %xmm2 1529 ; SSE2-NEXT: movd %xmm2, %eax 1530 ; SSE2-NEXT: # kill: def $al killed $al killed $eax 1531 ; SSE2-NEXT: retq 1532 ; 1533 ; SSE41-LABEL: test_v16i8: 1534 ; SSE41: # %bb.0: 1535 ; SSE41-NEXT: 
pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1536 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1537 ; SSE41-NEXT: pmullw %xmm1, %xmm0 1538 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 1539 ; SSE41-NEXT: pand %xmm1, %xmm0 1540 ; SSE41-NEXT: pxor %xmm2, %xmm2 1541 ; SSE41-NEXT: packuswb %xmm2, %xmm0 1542 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1543 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1544 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1545 ; SSE41-NEXT: pmullw %xmm3, %xmm0 1546 ; SSE41-NEXT: pand %xmm1, %xmm0 1547 ; SSE41-NEXT: packuswb %xmm2, %xmm0 1548 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1549 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1550 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1551 ; SSE41-NEXT: pmullw %xmm3, %xmm0 1552 ; SSE41-NEXT: pand %xmm1, %xmm0 1553 ; SSE41-NEXT: packuswb %xmm2, %xmm0 1554 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1555 ; SSE41-NEXT: psrlw $8, %xmm0 1556 ; SSE41-NEXT: pmullw %xmm1, %xmm0 1557 ; SSE41-NEXT: pextrb $0, %xmm0, %eax 1558 ; SSE41-NEXT: # kill: def $al killed $al killed $eax 1559 ; SSE41-NEXT: retq 1560 ; 1561 ; AVX1-LABEL: test_v16i8: 1562 ; AVX1: # %bb.0: 1563 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1564 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1565 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1566 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[255,255,255,255,255,255,255,255] 1567 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1568 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1569 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1570 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1571 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1572 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1573 ; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 1574 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1575 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1576 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1577 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1578 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1579 ; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 1580 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1581 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1582 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 1583 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1584 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1585 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax 1586 ; AVX1-NEXT: # kill: def $al killed $al killed $eax 1587 ; AVX1-NEXT: retq 1588 ; 1589 ; AVX2-LABEL: test_v16i8: 1590 ; AVX2: # %bb.0: 1591 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1592 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1593 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1594 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1595 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1596 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1597 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1598 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1599 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1600 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1601 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 
1602 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1603 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1604 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1605 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1606 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1607 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1608 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 1609 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1610 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1611 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1612 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1613 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1614 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1615 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1616 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 1617 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1618 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1619 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1620 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax 1621 ; AVX2-NEXT: # kill: def $al killed $al killed $eax 1622 ; AVX2-NEXT: vzeroupper 1623 ; AVX2-NEXT: retq 1624 ; 1625 ; AVX512BW-LABEL: test_v16i8: 1626 ; AVX512BW: # %bb.0: 1627 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1628 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1629 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1 1630 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1631 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1632 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1633 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1634 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1 1635 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1636 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1637 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 1638 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1639 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1 1640 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1641 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1642 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 1643 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 1644 ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1 1645 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1646 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 
1647 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax 1648 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax 1649 ; AVX512BW-NEXT: vzeroupper 1650 ; AVX512BW-NEXT: retq 1651 ; 1652 ; AVX512BWVL-LABEL: test_v16i8: 1653 ; AVX512BWVL: # %bb.0: 1654 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1655 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1656 ; AVX512BWVL-NEXT: vpmovsxbw %xmm1, %ymm1 1657 ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1658 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1659 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1660 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1661 ; AVX512BWVL-NEXT: vpmovsxbw %xmm1, %ymm1 1662 ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1663 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1664 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 1665 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1666 ; AVX512BWVL-NEXT: vpmovsxbw %xmm1, %ymm1 1667 ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1668 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1669 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 1670 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 1671 ; AVX512BWVL-NEXT: vpmovsxbw %xmm1, %ymm1 1672 ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1673 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1674 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax 1675 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax 1676 ; AVX512BWVL-NEXT: vzeroupper 1677 ; AVX512BWVL-NEXT: retq 1678 ; 1679 ; AVX512DQ-LABEL: test_v16i8: 1680 ; AVX512DQ: # %bb.0: 1681 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1682 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 1683 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 1684 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1685 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 1686 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1687 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1688 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 1689 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 1690 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1691 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 
1692 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1693 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 1694 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 1695 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 1696 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1697 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 1698 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1699 ; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 1700 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 1701 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 1702 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1703 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 1704 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1705 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax 1706 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax 1707 ; AVX512DQ-NEXT: vzeroupper 1708 ; AVX512DQ-NEXT: retq 1709 ; 1710 ; AVX512DQVL-LABEL: test_v16i8: 1711 ; AVX512DQVL: # %bb.0: 1712 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1713 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 1714 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 1715 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1716 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 1717 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1718 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1719 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 1720 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 1721 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1722 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 1723 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1724 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 1725 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 1726 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 1727 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1728 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 1729 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 1730 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 1731 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 1732 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 1733 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1734 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 1735 ; AVX512DQVL-NEXT: vpmovdb %zmm0, 
%xmm0 1736 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax 1737 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax 1738 ; AVX512DQVL-NEXT: vzeroupper 1739 ; AVX512DQVL-NEXT: retq 1740 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> %a0) 1741 ret i8 %1 1742 } 1743 1744 define i8 @test_v32i8(<32 x i8> %a0) { 1745 ; SSE2-LABEL: test_v32i8: 1746 ; SSE2: # %bb.0: 1747 ; SSE2-NEXT: movdqa %xmm1, %xmm2 1748 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 1749 ; SSE2-NEXT: movdqa %xmm0, %xmm3 1750 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 1751 ; SSE2-NEXT: pmullw %xmm2, %xmm3 1752 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1753 ; SSE2-NEXT: pand %xmm2, %xmm3 1754 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1755 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1756 ; SSE2-NEXT: pmullw %xmm1, %xmm0 1757 ; SSE2-NEXT: pand %xmm2, %xmm0 1758 ; SSE2-NEXT: packuswb %xmm3, %xmm0 1759 ; SSE2-NEXT: movdqa %xmm0, %xmm1 1760 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1761 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1762 ; SSE2-NEXT: pmullw %xmm1, %xmm0 1763 ; SSE2-NEXT: pand %xmm2, %xmm0 1764 ; SSE2-NEXT: pxor %xmm3, %xmm3 1765 ; SSE2-NEXT: packuswb %xmm3, %xmm0 1766 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3] 1767 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1768 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1769 ; SSE2-NEXT: pmullw %xmm0, %xmm1 1770 ; SSE2-NEXT: pand %xmm2, %xmm1 1771 ; SSE2-NEXT: packuswb %xmm3, %xmm1 1772 ; SSE2-NEXT: movdqa %xmm1, %xmm0 1773 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1774 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 1775 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1776 ; SSE2-NEXT: pmullw %xmm0, %xmm1 1777 ; SSE2-NEXT: pand %xmm2, %xmm1 1778 ; SSE2-NEXT: packuswb %xmm3, %xmm1 1779 ; SSE2-NEXT: movdqa %xmm1, %xmm0 1780 ; SSE2-NEXT: psrlw $8, %xmm0 1781 ; SSE2-NEXT: movdqa %xmm1, %xmm3 1782 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 1783 ; SSE2-NEXT: pmullw %xmm0, %xmm3 1784 ; SSE2-NEXT: pand %xmm2, %xmm3 1785 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1786 ; SSE2-NEXT: pmullw %xmm0, %xmm1 1787 ; SSE2-NEXT: pand %xmm2, %xmm1 1788 ; SSE2-NEXT: packuswb %xmm3, %xmm1 1789 ; SSE2-NEXT: movd %xmm1, %eax 1790 ; SSE2-NEXT: # kill: def $al killed $al killed $eax 1791 ; SSE2-NEXT: retq 1792 ; 1793 ; SSE41-LABEL: test_v32i8: 1794 ; SSE41: # %bb.0: 1795 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1796 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1797 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1798 ; SSE41-NEXT: punpckhbw {{.*#+}} 
xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1799 ; SSE41-NEXT: pmullw %xmm1, %xmm0 1800 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 1801 ; SSE41-NEXT: pand %xmm1, %xmm0 1802 ; SSE41-NEXT: pmullw %xmm2, %xmm3 1803 ; SSE41-NEXT: pand %xmm1, %xmm3 1804 ; SSE41-NEXT: packuswb %xmm0, %xmm3 1805 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1806 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1807 ; SSE41-NEXT: pmullw %xmm0, %xmm3 1808 ; SSE41-NEXT: pand %xmm1, %xmm3 1809 ; SSE41-NEXT: pxor %xmm0, %xmm0 1810 ; SSE41-NEXT: packuswb %xmm0, %xmm3 1811 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1812 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1813 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 1814 ; SSE41-NEXT: pmullw %xmm2, %xmm3 1815 ; SSE41-NEXT: pand %xmm1, %xmm3 1816 ; SSE41-NEXT: packuswb %xmm0, %xmm3 1817 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1818 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1819 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] 1820 ; SSE41-NEXT: pmullw %xmm2, %xmm3 1821 ; SSE41-NEXT: pand %xmm1, %xmm3 1822 ; SSE41-NEXT: packuswb %xmm0, %xmm3 1823 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1824 ; SSE41-NEXT: psrlw $8, %xmm3 1825 ; SSE41-NEXT: pmullw %xmm0, %xmm3 1826 ; SSE41-NEXT: pextrb $0, %xmm3, %eax 1827 ; SSE41-NEXT: # kill: def $al killed $al killed $eax 1828 ; SSE41-NEXT: retq 1829 ; 1830 ; AVX1-LABEL: test_v32i8: 1831 ; AVX1: # %bb.0: 1832 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1833 ; 
AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1834 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1835 ; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3 1836 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 1837 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 1838 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1839 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1840 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1841 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1842 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1843 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 1844 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1845 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1846 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 1847 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 1848 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1849 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1850 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1851 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1852 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1853 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] 1854 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1855 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1856 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 1857 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 1858 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1859 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1860 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1861 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1862 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1863 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 1864 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1865 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1866 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 1867 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 1868 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1869 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1870 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1871 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1872 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1873 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 1874 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1875 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1876 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 1877 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 1878 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1879 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1880 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1881 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1882 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1883 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax 1884 ; AVX1-NEXT: # kill: def $al killed $al killed 
$eax 1885 ; AVX1-NEXT: vzeroupper 1886 ; AVX1-NEXT: retq 1887 ; 1888 ; AVX2-LABEL: test_v32i8: 1889 ; AVX2: # %bb.0: 1890 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1891 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1892 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1893 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1894 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1895 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1896 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1897 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1898 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1899 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1900 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1901 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1902 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1903 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1904 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1905 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1906 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1907 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1908 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1909 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1910 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1911 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1912 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1913 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1914 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1915 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 1916 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1917 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1918 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1919 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1920 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1921 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1922 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1923 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 1924 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1925 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 1926 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1927 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax 1928 ; AVX2-NEXT: # kill: def $al killed $al killed $eax 1929 ; 
AVX2-NEXT: vzeroupper 1930 ; AVX2-NEXT: retq 1931 ; 1932 ; AVX512BW-LABEL: test_v32i8: 1933 ; AVX512BW: # %bb.0: 1934 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1935 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 1936 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 1937 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1938 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1939 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1940 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 1941 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 1942 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1943 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1944 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1945 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 1946 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 1947 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1948 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1949 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 1950 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 1951 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 1952 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1953 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1954 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 1955 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 1956 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 1957 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1958 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1959 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax 1960 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax 1961 ; AVX512BW-NEXT: vzeroupper 1962 ; AVX512BW-NEXT: retq 1963 ; 1964 ; AVX512BWVL-LABEL: test_v32i8: 1965 ; AVX512BWVL: # %bb.0: 1966 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 1967 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 1968 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 1969 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1970 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 1971 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 1972 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 1973 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 1974 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1975 ; 
AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 1976 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 1977 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 1978 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 1979 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1980 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 1981 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 1982 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 1983 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 1984 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1985 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 1986 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 1987 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 1988 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 1989 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 1990 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 1991 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax 1992 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax 1993 ; AVX512BWVL-NEXT: vzeroupper 1994 ; AVX512BWVL-NEXT: retq 1995 ; 1996 ; AVX512DQ-LABEL: test_v32i8: 1997 ; AVX512DQ: # %bb.0: 1998 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 1999 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 2000 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2001 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2002 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2003 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2004 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2005 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2006 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 2007 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2008 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2009 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2010 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2011 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2012 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 2013 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2014 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2015 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2016 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 2017 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2018 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, 
%ymm1 2019 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2020 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2021 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2022 ; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 2023 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2024 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 2025 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2026 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2027 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2028 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax 2029 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax 2030 ; AVX512DQ-NEXT: vzeroupper 2031 ; AVX512DQ-NEXT: retq 2032 ; 2033 ; AVX512DQVL-LABEL: test_v32i8: 2034 ; AVX512DQVL: # %bb.0: 2035 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 2036 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2037 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2038 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2039 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2040 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2041 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2042 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2043 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2044 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2045 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2046 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2047 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2048 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2049 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2050 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2051 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2052 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2053 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 2054 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2055 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2056 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2057 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2058 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2059 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 2060 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2061 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2062 ; 
AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2063 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2064 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2065 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax 2066 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax 2067 ; AVX512DQVL-NEXT: vzeroupper 2068 ; AVX512DQVL-NEXT: retq 2069 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> %a0) 2070 ret i8 %1 2071 } 2072 2073 define i8 @test_v64i8(<64 x i8> %a0) { 2074 ; SSE2-LABEL: test_v64i8: 2075 ; SSE2: # %bb.0: 2076 ; SSE2-NEXT: movdqa %xmm2, %xmm4 2077 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 2078 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2079 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2080 ; SSE2-NEXT: pmullw %xmm4, %xmm5 2081 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 2082 ; SSE2-NEXT: pand %xmm4, %xmm5 2083 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 2084 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2085 ; SSE2-NEXT: pmullw %xmm2, %xmm0 2086 ; SSE2-NEXT: pand %xmm4, %xmm0 2087 ; SSE2-NEXT: packuswb %xmm5, %xmm0 2088 ; SSE2-NEXT: movdqa %xmm3, %xmm2 2089 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2090 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2091 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2092 ; SSE2-NEXT: pmullw %xmm2, %xmm5 2093 ; SSE2-NEXT: pand %xmm4, %xmm5 2094 
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 2095 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2096 ; SSE2-NEXT: pmullw %xmm3, %xmm1 2097 ; SSE2-NEXT: pand %xmm4, %xmm1 2098 ; SSE2-NEXT: packuswb %xmm5, %xmm1 2099 ; SSE2-NEXT: movdqa %xmm1, %xmm2 2100 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2101 ; SSE2-NEXT: movdqa %xmm0, %xmm3 2102 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 2103 ; SSE2-NEXT: pmullw %xmm2, %xmm3 2104 ; SSE2-NEXT: pand %xmm4, %xmm3 2105 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2106 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2107 ; SSE2-NEXT: pmullw %xmm1, %xmm0 2108 ; SSE2-NEXT: pand %xmm4, %xmm0 2109 ; SSE2-NEXT: packuswb %xmm3, %xmm0 2110 ; SSE2-NEXT: movdqa %xmm0, %xmm1 2111 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 2112 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2113 ; SSE2-NEXT: pmullw %xmm1, %xmm0 2114 ; SSE2-NEXT: pand %xmm4, %xmm0 2115 ; SSE2-NEXT: pxor %xmm2, %xmm2 2116 ; SSE2-NEXT: packuswb %xmm2, %xmm0 2117 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3] 2118 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2119 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2120 ; SSE2-NEXT: pmullw %xmm0, %xmm1 2121 ; SSE2-NEXT: pand %xmm4, %xmm1 2122 ; SSE2-NEXT: packuswb %xmm2, %xmm1 2123 ; SSE2-NEXT: movdqa %xmm1, %xmm0 2124 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2125 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 2126 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2127 ; SSE2-NEXT: pmullw %xmm0, %xmm1 2128 ; SSE2-NEXT: pand %xmm4, %xmm1 2129 ; SSE2-NEXT: packuswb %xmm2, %xmm1 2130 ; SSE2-NEXT: movdqa %xmm1, %xmm0 2131 ; SSE2-NEXT: psrlw $8, %xmm0 2132 ; SSE2-NEXT: movdqa %xmm1, %xmm2 2133 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2134 ; SSE2-NEXT: pmullw %xmm0, %xmm2 2135 ; SSE2-NEXT: pand %xmm4, %xmm2 2136 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2137 ; SSE2-NEXT: pmullw %xmm0, %xmm1 2138 ; SSE2-NEXT: pand %xmm4, %xmm1 2139 ; SSE2-NEXT: packuswb %xmm2, %xmm1 2140 ; SSE2-NEXT: movd %xmm1, %eax 2141 ; SSE2-NEXT: # kill: def $al killed $al killed $eax 2142 ; SSE2-NEXT: retq 2143 ; 2144 ; SSE41-LABEL: test_v64i8: 2145 ; SSE41: # %bb.0: 2146 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2147 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2148 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2149 ; SSE41-NEXT: punpckhbw {{.*#+}} 
xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2150 ; SSE41-NEXT: pmullw %xmm2, %xmm0 2151 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2152 ; SSE41-NEXT: pand %xmm2, %xmm0 2153 ; SSE41-NEXT: pmullw %xmm5, %xmm4 2154 ; SSE41-NEXT: pand %xmm2, %xmm4 2155 ; SSE41-NEXT: packuswb %xmm0, %xmm4 2156 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2157 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2158 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2159 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2160 ; SSE41-NEXT: pmullw %xmm3, %xmm1 2161 ; SSE41-NEXT: pand %xmm2, %xmm1 2162 ; SSE41-NEXT: pmullw %xmm0, %xmm5 2163 ; SSE41-NEXT: pand %xmm2, %xmm5 2164 ; SSE41-NEXT: packuswb %xmm1, %xmm5 2165 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 2166 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2167 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 2168 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2169 ; SSE41-NEXT: pmullw %xmm5, %xmm4 2170 ; SSE41-NEXT: pand %xmm2, %xmm4 2171 ; SSE41-NEXT: pmullw %xmm0, %xmm1 2172 ; SSE41-NEXT: pand %xmm2, %xmm1 2173 ; SSE41-NEXT: packuswb %xmm4, %xmm1 2174 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2175 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2176 ; SSE41-NEXT: pmullw %xmm0, %xmm1 2177 ; SSE41-NEXT: pand %xmm2, 
%xmm1 2178 ; SSE41-NEXT: pxor %xmm0, %xmm0 2179 ; SSE41-NEXT: packuswb %xmm0, %xmm1 2180 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2181 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2182 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2183 ; SSE41-NEXT: pmullw %xmm3, %xmm1 2184 ; SSE41-NEXT: pand %xmm2, %xmm1 2185 ; SSE41-NEXT: packuswb %xmm0, %xmm1 2186 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2187 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2188 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] 2189 ; SSE41-NEXT: pmullw %xmm3, %xmm1 2190 ; SSE41-NEXT: pand %xmm2, %xmm1 2191 ; SSE41-NEXT: packuswb %xmm0, %xmm1 2192 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2193 ; SSE41-NEXT: psrlw $8, %xmm1 2194 ; SSE41-NEXT: pmullw %xmm0, %xmm1 2195 ; SSE41-NEXT: pextrb $0, %xmm1, %eax 2196 ; SSE41-NEXT: # kill: def $al killed $al killed $eax 2197 ; SSE41-NEXT: retq 2198 ; 2199 ; AVX1-LABEL: test_v64i8: 2200 ; AVX1: # %bb.0: 2201 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2202 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2203 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3 2204 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2205 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 2206 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2207 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2208 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 
2209 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 2210 ; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 2211 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2212 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2213 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2214 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2215 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 2216 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 2217 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2218 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2219 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2220 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2221 ; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 2222 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2223 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2224 ; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm1 2225 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 2226 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2227 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2228 ; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 2229 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2230 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2231 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2232 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2233 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2234 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 2235 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 2236 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2237 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2238 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2239 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2240 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 2241 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2242 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2243 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2244 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 2245 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 2246 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2247 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2248 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2249 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2250 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 2251 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 2252 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2253 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2254 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 2255 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 2256 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2257 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2258 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2259 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2260 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 2261 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 2262 ; AVX1-NEXT: vpunpckhbw {{.*#+}} 
xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2263 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2264 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 2265 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 2266 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2267 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2268 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2269 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2270 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 2271 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax 2272 ; AVX1-NEXT: # kill: def $al killed $al killed $eax 2273 ; AVX1-NEXT: vzeroupper 2274 ; AVX1-NEXT: retq 2275 ; 2276 ; AVX2-LABEL: test_v64i8: 2277 ; AVX2: # %bb.0: 2278 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm2 2279 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3 2280 ; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm3 2281 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 2282 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2283 ; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4 2284 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 2285 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] 2286 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 2287 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2288 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2289 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2290 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2291 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2292 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2293 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2294 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2295 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2296 ; AVX2-NEXT: vpmovsxbw %xmm3, %ymm1 2297 ; AVX2-NEXT: vpmullw %ymm0, %ymm1, %ymm0 2298 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2299 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2300 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2301 ; 
AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2302 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2303 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2304 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2305 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2306 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2307 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2308 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2309 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2310 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2311 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2312 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2313 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2314 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2315 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2316 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2317 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2318 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 2319 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2320 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2321 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2322 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2323 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2324 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2325 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2326 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 2327 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2328 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2329 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2330 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax 2331 ; AVX2-NEXT: # kill: def $al killed $al killed $eax 2332 ; AVX2-NEXT: vzeroupper 2333 ; AVX2-NEXT: retq 2334 ; 2335 ; AVX512BW-LABEL: test_v64i8: 2336 ; AVX512BW: # %bb.0: 2337 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2338 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2339 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2340 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2341 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2342 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 2343 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2344 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2345 ; AVX512BW-NEXT: vpmullw 
%zmm1, %zmm0, %zmm0 2346 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2347 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2348 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2349 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2350 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2351 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2352 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2353 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2354 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2355 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2356 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2357 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 2358 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2359 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2360 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2361 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2362 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 2363 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2364 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2365 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2366 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2367 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax 2368 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax 2369 ; AVX512BW-NEXT: vzeroupper 2370 ; AVX512BW-NEXT: retq 2371 ; 2372 ; AVX512BWVL-LABEL: test_v64i8: 2373 ; AVX512BWVL: # %bb.0: 2374 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 2375 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2376 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2377 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2378 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2379 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 2380 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2381 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2382 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2383 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2384 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2385 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2386 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2387 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2388 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2389 ; 
AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2390 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2391 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2392 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2393 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2394 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 2395 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2396 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2397 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2398 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2399 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 2400 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2401 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2402 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2403 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2404 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax 2405 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax 2406 ; AVX512BWVL-NEXT: vzeroupper 2407 ; AVX512BWVL-NEXT: retq 2408 ; 2409 ; AVX512DQ-LABEL: test_v64i8: 2410 ; AVX512DQ: # %bb.0: 2411 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm2 2412 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm3 2413 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 2414 ; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm2 2415 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 2416 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 2417 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 2418 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 2419 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2420 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2421 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2422 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2423 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2424 ; AVX512DQ-NEXT: vpmovsxbw %xmm2, %ymm1 2425 ; AVX512DQ-NEXT: vpmullw %ymm0, %ymm1, %ymm0 2426 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2427 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2428 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2429 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2430 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 2431 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2432 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, 
%zmm0 2433 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2434 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2435 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2436 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 2437 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2438 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2439 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2440 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 2441 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2442 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 2443 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2444 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2445 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2446 ; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 2447 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 2448 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 2449 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2450 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 2451 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2452 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax 2453 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax 2454 ; AVX512DQ-NEXT: vzeroupper 2455 ; AVX512DQ-NEXT: retq 2456 ; 2457 ; AVX512DQVL-LABEL: test_v64i8: 2458 ; AVX512DQVL: # %bb.0: 2459 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm2 2460 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm3 2461 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 2462 ; AVX512DQVL-NEXT: vpmovsxwd %ymm2, %zmm2 2463 ; AVX512DQVL-NEXT: vpmovdb %zmm2, %xmm2 2464 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm1, %xmm1 2465 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2466 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm0 2467 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2468 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2469 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2470 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2471 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2472 ; AVX512DQVL-NEXT: vpmovsxbw %xmm2, %ymm1 2473 ; AVX512DQVL-NEXT: vpmullw %ymm0, %ymm1, %ymm0 2474 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2475 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2476 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] 2477 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2478 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2479 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2480 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2481 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2482 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2483 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2484 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2485 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2486 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2487 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2488 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 2489 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2490 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2491 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2492 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2493 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2494 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 2495 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 2496 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 2497 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2498 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 2499 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 2500 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax 2501 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax 2502 ; AVX512DQVL-NEXT: vzeroupper 2503 ; AVX512DQVL-NEXT: retq 2504 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> %a0) 2505 ret i8 %1 2506 } 2507 2508 define i8 @test_v128i8(<128 x i8> %a0) { 2509 ; SSE2-LABEL: test_v128i8: 2510 ; SSE2: # %bb.0: 2511 ; SSE2-NEXT: movdqa %xmm5, %xmm8 2512 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] 2513 ; SSE2-NEXT: movdqa %xmm1, %xmm9 2514 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] 2515 ; SSE2-NEXT: pmullw %xmm8, %xmm9 2516 ; 
SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 2517 ; SSE2-NEXT: pand %xmm8, %xmm9 2518 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 2519 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2520 ; SSE2-NEXT: pmullw %xmm5, %xmm1 2521 ; SSE2-NEXT: pand %xmm8, %xmm1 2522 ; SSE2-NEXT: packuswb %xmm9, %xmm1 2523 ; SSE2-NEXT: movdqa %xmm7, %xmm9 2524 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] 2525 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2526 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2527 ; SSE2-NEXT: pmullw %xmm9, %xmm5 2528 ; SSE2-NEXT: pand %xmm8, %xmm5 2529 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] 2530 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 2531 ; SSE2-NEXT: pmullw %xmm7, %xmm3 2532 ; SSE2-NEXT: pand %xmm8, %xmm3 2533 ; SSE2-NEXT: packuswb %xmm5, %xmm3 2534 ; SSE2-NEXT: movdqa %xmm4, %xmm5 2535 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2536 ; SSE2-NEXT: movdqa %xmm0, %xmm7 2537 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] 2538 ; SSE2-NEXT: 
pmullw %xmm5, %xmm7 2539 ; SSE2-NEXT: pand %xmm8, %xmm7 2540 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 2541 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2542 ; SSE2-NEXT: pmullw %xmm4, %xmm0 2543 ; SSE2-NEXT: pand %xmm8, %xmm0 2544 ; SSE2-NEXT: packuswb %xmm7, %xmm0 2545 ; SSE2-NEXT: movdqa %xmm6, %xmm4 2546 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 2547 ; SSE2-NEXT: movdqa %xmm2, %xmm5 2548 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2549 ; SSE2-NEXT: pmullw %xmm4, %xmm5 2550 ; SSE2-NEXT: pand %xmm8, %xmm5 2551 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] 2552 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 2553 ; SSE2-NEXT: pmullw %xmm6, %xmm2 2554 ; SSE2-NEXT: pand %xmm8, %xmm2 2555 ; SSE2-NEXT: packuswb %xmm5, %xmm2 2556 ; SSE2-NEXT: movdqa %xmm2, %xmm4 2557 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 2558 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2559 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] 2560 ; SSE2-NEXT: pmullw %xmm4, %xmm5 2561 ; SSE2-NEXT: pand %xmm8, %xmm5 2562 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 2563 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2564 ; SSE2-NEXT: pmullw %xmm2, %xmm0 2565 ; SSE2-NEXT: pand %xmm8, %xmm0 2566 ; SSE2-NEXT: packuswb %xmm5, %xmm0 2567 ; SSE2-NEXT: movdqa %xmm3, %xmm2 2568 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2569 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2570 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 2571 ; SSE2-NEXT: pmullw %xmm2, %xmm4 2572 ; SSE2-NEXT: pand %xmm8, %xmm4 2573 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 2574 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2575 ; SSE2-NEXT: pmullw %xmm3, %xmm1 2576 ; SSE2-NEXT: pand %xmm8, %xmm1 2577 ; SSE2-NEXT: packuswb %xmm4, %xmm1 2578 ; SSE2-NEXT: movdqa %xmm1, %xmm2 2579 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2580 ; SSE2-NEXT: movdqa %xmm0, %xmm3 2581 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 2582 ; SSE2-NEXT: pmullw %xmm2, %xmm3 2583 ; SSE2-NEXT: pand %xmm8, %xmm3 2584 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2585 ; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2586 ; SSE2-NEXT: pmullw %xmm1, %xmm0 2587 ; SSE2-NEXT: pand %xmm8, %xmm0 2588 ; SSE2-NEXT: packuswb %xmm3, %xmm0 2589 ; SSE2-NEXT: movdqa %xmm0, %xmm1 2590 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 2591 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2592 ; SSE2-NEXT: pmullw %xmm1, %xmm0 2593 ; SSE2-NEXT: pand %xmm8, %xmm0 2594 ; SSE2-NEXT: pxor %xmm2, %xmm2 2595 ; SSE2-NEXT: packuswb %xmm2, %xmm0 2596 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3] 2597 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2598 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2599 ; SSE2-NEXT: pmullw %xmm0, %xmm1 2600 ; SSE2-NEXT: pand %xmm8, %xmm1 2601 ; SSE2-NEXT: packuswb %xmm2, %xmm1 2602 ; SSE2-NEXT: movdqa %xmm1, %xmm0 2603 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2604 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 2605 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2606 ; SSE2-NEXT: pmullw %xmm0, %xmm1 2607 ; SSE2-NEXT: pand %xmm8, %xmm1 2608 ; SSE2-NEXT: packuswb %xmm2, %xmm1 2609 ; SSE2-NEXT: movdqa %xmm1, %xmm0 2610 ; SSE2-NEXT: psrlw $8, %xmm0 2611 ; SSE2-NEXT: movdqa %xmm1, %xmm2 2612 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2613 ; SSE2-NEXT: pmullw %xmm0, %xmm2 2614 ; SSE2-NEXT: pand %xmm8, %xmm2 2615 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2616 ; SSE2-NEXT: pmullw %xmm0, %xmm1 2617 ; SSE2-NEXT: pand %xmm8, %xmm1 2618 ; SSE2-NEXT: packuswb %xmm2, %xmm1 2619 ; SSE2-NEXT: movd %xmm1, %eax 2620 ; SSE2-NEXT: # kill: def $al killed $al killed $eax 2621 ; SSE2-NEXT: retq 2622 ; 2623 ; SSE41-LABEL: test_v128i8: 2624 ; SSE41: # %bb.0: 2625 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 2626 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2627 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2628 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2629 ; SSE41-NEXT: pmullw %xmm5, %xmm1 2630 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 2631 ; SSE41-NEXT: pand %xmm5, %xmm1 2632 ; SSE41-NEXT: pmullw %xmm9, %xmm8 2633 ; SSE41-NEXT: pand %xmm5, %xmm8 2634 ; SSE41-NEXT: packuswb %xmm1, %xmm8 2635 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero 2636 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2637 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2638 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2639 ; SSE41-NEXT: pmullw %xmm7, %xmm3 2640 ; SSE41-NEXT: pand %xmm5, %xmm3 2641 ; SSE41-NEXT: pmullw %xmm9, %xmm1 2642 ; SSE41-NEXT: pand %xmm5, %xmm1 2643 ; SSE41-NEXT: packuswb %xmm3, %xmm1 2644 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 2645 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2646 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2647 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2648 ; SSE41-NEXT: pmullw %xmm4, %xmm0 2649 ; SSE41-NEXT: pand %xmm5, %xmm0 2650 ; SSE41-NEXT: pmullw %xmm7, %xmm3 2651 ; SSE41-NEXT: pand %xmm5, %xmm3 2652 ; SSE41-NEXT: packuswb %xmm0, %xmm3 2653 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 2654 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2655 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2656 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2657 ; SSE41-NEXT: pmullw %xmm6, %xmm2 2658 ; SSE41-NEXT: pand %xmm5, %xmm2 2659 ; SSE41-NEXT: pmullw %xmm0, %xmm4 2660 ; SSE41-NEXT: pand %xmm5, %xmm4 2661 ; SSE41-NEXT: packuswb %xmm2, %xmm4 2662 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 2663 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2664 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2665 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2666 ; SSE41-NEXT: pmullw %xmm4, %xmm3 2667 ; SSE41-NEXT: pand %xmm5, %xmm3 2668 ; SSE41-NEXT: pmullw %xmm2, %xmm0 2669 ; SSE41-NEXT: pand %xmm5, %xmm0 2670 ; SSE41-NEXT: packuswb %xmm3, %xmm0 2671 ; 
SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2672 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2673 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero 2674 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2675 ; SSE41-NEXT: pmullw %xmm1, %xmm8 2676 ; SSE41-NEXT: pand %xmm5, %xmm8 2677 ; SSE41-NEXT: pmullw %xmm2, %xmm3 2678 ; SSE41-NEXT: pand %xmm5, %xmm3 2679 ; SSE41-NEXT: packuswb %xmm8, %xmm3 2680 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2681 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2682 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2683 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2684 ; SSE41-NEXT: pmullw %xmm3, %xmm0 2685 ; SSE41-NEXT: pand %xmm5, %xmm0 2686 ; SSE41-NEXT: pmullw %xmm1, %xmm2 2687 ; SSE41-NEXT: pand %xmm5, %xmm2 2688 ; SSE41-NEXT: packuswb %xmm0, %xmm2 2689 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2690 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2691 ; SSE41-NEXT: pmullw %xmm0, %xmm2 2692 ; SSE41-NEXT: pand %xmm5, %xmm2 2693 ; SSE41-NEXT: pxor %xmm0, %xmm0 2694 ; SSE41-NEXT: packuswb %xmm0, %xmm2 2695 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2696 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2697 ; SSE41-NEXT: 
pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2698 ; SSE41-NEXT: pmullw %xmm1, %xmm2 2699 ; SSE41-NEXT: pand %xmm5, %xmm2 2700 ; SSE41-NEXT: packuswb %xmm0, %xmm2 2701 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2702 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2703 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] 2704 ; SSE41-NEXT: pmullw %xmm1, %xmm2 2705 ; SSE41-NEXT: pand %xmm5, %xmm2 2706 ; SSE41-NEXT: packuswb %xmm0, %xmm2 2707 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2708 ; SSE41-NEXT: psrlw $8, %xmm2 2709 ; SSE41-NEXT: pmullw %xmm0, %xmm2 2710 ; SSE41-NEXT: pextrb $0, %xmm2, %eax 2711 ; SSE41-NEXT: # kill: def $al killed $al killed $eax 2712 ; SSE41-NEXT: retq 2713 ; 2714 ; AVX1-LABEL: test_v128i8: 2715 ; AVX1: # %bb.0: 2716 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 2717 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2718 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 2719 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2720 ; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm7 2721 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 2722 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 2723 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 2724 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 2725 ; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 2726 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5 2727 ; AVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm8 2728 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 2729 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2730 
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 2731 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2732 ; AVX1-NEXT: vpmullw %xmm9, %xmm7, %xmm7 2733 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 2734 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 2735 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 2736 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 2737 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5 2738 ; AVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm6 2739 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2740 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2741 ; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5 2742 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5 2743 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2744 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2745 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 2746 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2747 ; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0 2748 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2749 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2750 ; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2 2751 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2752 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2753 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2754 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 
2755 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2756 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2757 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2758 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2759 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 2760 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2761 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2762 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2763 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2764 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2765 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2766 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2767 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2768 ; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 2769 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2770 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 2771 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero 2772 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 2773 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2774 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 2775 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2776 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2777 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 2778 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2779 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2780 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2781 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2782 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2783 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2784 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2785 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2786 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2787 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 2788 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2789 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2790 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2791 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2792 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2793 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2794 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2795 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2796 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2797 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 2798 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2799 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2800 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2801 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2802 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2803 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2804 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 2805 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2806 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2807 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 2808 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2809 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2810 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2811 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2812 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2813 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2814 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 2815 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2816 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2817 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 2818 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2819 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2820 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2821 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2822 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2823 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2824 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax 2825 ; AVX1-NEXT: # kill: def $al killed $al killed $eax 2826 ; AVX1-NEXT: vzeroupper 2827 ; AVX1-NEXT: retq 2828 ; 2829 ; AVX2-LABEL: test_v128i8: 2830 ; AVX2: # %bb.0: 2831 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 2832 ; AVX2-NEXT: vpmovsxbw %xmm4, %ymm4 2833 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 2834 ; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5 2835 ; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm5 2836 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 2837 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 2838 ; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 2839 ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 2840 ; 
AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] 2841 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 2842 ; AVX2-NEXT: vpmovsxbw %xmm6, %ymm6 2843 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 2844 ; AVX2-NEXT: vpmovsxbw %xmm7, %ymm7 2845 ; AVX2-NEXT: vpmullw %ymm6, %ymm7, %ymm6 2846 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 2847 ; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm7 2848 ; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 2849 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] 2850 ; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2 2851 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2852 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 2853 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2854 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 2855 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 2856 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2857 ; AVX2-NEXT: vpmovsxbw %xmm3, %ymm2 2858 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2859 ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 2860 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2861 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 2862 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 2863 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2864 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2865 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2866 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2867 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2868 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 2869 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 2870 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2871 ; AVX2-NEXT: vpmovsxbw %xmm6, %ymm1 2872 ; AVX2-NEXT: vpmovsxbw %xmm5, %ymm2 2873 ; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 2874 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2875 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 2876 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 2877 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2878 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2879 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2880 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2881 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2882 ; AVX2-NEXT: 
vpshufb %xmm4, %xmm1, %xmm1 2883 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 2884 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2885 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2886 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2887 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2888 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2889 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2890 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 2891 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 2892 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2893 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2894 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2895 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2896 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2897 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2898 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 2899 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 2900 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2901 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 2902 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2903 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2904 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2905 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2906 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 2907 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 2908 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2909 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 2910 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 2911 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2912 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2913 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax 2914 ; AVX2-NEXT: # kill: def $al killed $al killed $eax 2915 ; AVX2-NEXT: vzeroupper 2916 ; AVX2-NEXT: retq 2917 ; 2918 ; AVX512BW-LABEL: test_v128i8: 2919 ; AVX512BW: # %bb.0: 2920 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2 2921 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm3 2922 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2 2923 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 2924 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 2925 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2926 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, 
%ymm0 2927 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2928 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2929 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2930 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2931 ; AVX512BW-NEXT: vpmovsxbw %ymm2, %zmm1 2932 ; AVX512BW-NEXT: vpmullw %zmm0, %zmm1, %zmm0 2933 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2934 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 2935 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2936 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2937 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2938 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2939 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2940 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2941 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2942 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2943 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2944 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2945 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2946 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2947 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2948 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2949 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 2950 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2951 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2952 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2953 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2954 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 2955 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 2956 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 2957 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2958 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2959 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax 2960 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax 2961 ; AVX512BW-NEXT: vzeroupper 2962 ; AVX512BW-NEXT: retq 2963 ; 2964 ; AVX512BWVL-LABEL: test_v128i8: 2965 ; AVX512BWVL: # %bb.0: 2966 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm2 2967 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm3 2968 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2 2969 ; AVX512BWVL-NEXT: vpmovwb %zmm2, %ymm2 2970 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 2971 ; 
AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2972 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2973 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2974 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2975 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2976 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2977 ; AVX512BWVL-NEXT: vpmovsxbw %ymm2, %zmm1 2978 ; AVX512BWVL-NEXT: vpmullw %zmm0, %zmm1, %zmm0 2979 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2980 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 2981 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2982 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2983 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2984 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2985 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 2986 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2987 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2988 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2989 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2990 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 2991 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2992 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2993 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2994 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 2995 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 2996 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 2997 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 2998 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2999 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 3000 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 3001 ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 3002 ; AVX512BWVL-NEXT: vpmovsxbw %ymm1, %zmm1 3003 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 3004 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 3005 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax 3006 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax 3007 ; AVX512BWVL-NEXT: vzeroupper 3008 ; AVX512BWVL-NEXT: retq 3009 ; 3010 ; AVX512DQ-LABEL: test_v128i8: 3011 ; AVX512DQ: # %bb.0: 3012 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm4 3013 ; AVX512DQ-NEXT: vpmovsxbw %xmm4, 
%ymm4 3014 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5 3015 ; AVX512DQ-NEXT: vpmovsxbw %xmm5, %ymm5 3016 ; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4 3017 ; AVX512DQ-NEXT: vpmovsxwd %ymm4, %zmm4 3018 ; AVX512DQ-NEXT: vpmovdb %zmm4, %xmm4 3019 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 3020 ; AVX512DQ-NEXT: vpmovsxbw %xmm5, %ymm5 3021 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm6 3022 ; AVX512DQ-NEXT: vpmovsxbw %xmm6, %ymm6 3023 ; AVX512DQ-NEXT: vpmullw %ymm5, %ymm6, %ymm5 3024 ; AVX512DQ-NEXT: vpmovsxwd %ymm5, %zmm5 3025 ; AVX512DQ-NEXT: vpmovdb %zmm5, %xmm5 3026 ; AVX512DQ-NEXT: vpmovsxbw %xmm2, %ymm2 3027 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 3028 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 3029 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3030 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3031 ; AVX512DQ-NEXT: vpmovsxbw %xmm3, %ymm2 3032 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 3033 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 3034 ; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 3035 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 3036 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 3037 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 3038 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3039 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3040 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3041 ; AVX512DQ-NEXT: vpmovsxbw %xmm5, %ymm1 3042 ; AVX512DQ-NEXT: vpmovsxbw %xmm4, %ymm2 3043 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1 3044 ; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 3045 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 3046 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 3047 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 3048 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3049 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3050 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3051 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 3052 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 3053 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 3054 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3055 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3056 ; AVX512DQ-NEXT: vpmovdb %zmm0, 
%xmm0 3057 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 3058 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 3059 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 3060 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3061 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3062 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3063 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 3064 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 3065 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 3066 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3067 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3068 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3069 ; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 3070 ; AVX512DQ-NEXT: vpmovsxbw %xmm0, %ymm0 3071 ; AVX512DQ-NEXT: vpmovsxbw %xmm1, %ymm1 3072 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3073 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 3074 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3075 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax 3076 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax 3077 ; AVX512DQ-NEXT: vzeroupper 3078 ; AVX512DQ-NEXT: retq 3079 ; 3080 ; AVX512DQVL-LABEL: test_v128i8: 3081 ; AVX512DQVL: # %bb.0: 3082 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm2, %xmm4 3083 ; AVX512DQVL-NEXT: vpmovsxbw %xmm4, %ymm4 3084 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm5 3085 ; AVX512DQVL-NEXT: vpmovsxbw %xmm5, %ymm5 3086 ; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 3087 ; AVX512DQVL-NEXT: vpmovsxwd %ymm4, %zmm4 3088 ; AVX512DQVL-NEXT: vpmovdb %zmm4, %xmm4 3089 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm3, %xmm5 3090 ; AVX512DQVL-NEXT: vpmovsxbw %xmm5, %ymm5 3091 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm1, %xmm6 3092 ; AVX512DQVL-NEXT: vpmovsxbw %xmm6, %ymm6 3093 ; AVX512DQVL-NEXT: vpmullw %ymm5, %ymm6, %ymm5 3094 ; AVX512DQVL-NEXT: vpmovsxwd %ymm5, %zmm5 3095 ; AVX512DQVL-NEXT: vpmovdb %zmm5, %xmm5 3096 ; AVX512DQVL-NEXT: vpmovsxbw %xmm2, %ymm2 3097 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 3098 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 3099 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 3100 ; AVX512DQVL-NEXT: vpmovdb 
%zmm0, %xmm0 3101 ; AVX512DQVL-NEXT: vpmovsxbw %xmm3, %ymm2 3102 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 3103 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1 3104 ; AVX512DQVL-NEXT: vpmovsxwd %ymm1, %zmm1 3105 ; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1 3106 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 3107 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 3108 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3109 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 3110 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 3111 ; AVX512DQVL-NEXT: vpmovsxbw %xmm5, %ymm1 3112 ; AVX512DQVL-NEXT: vpmovsxbw %xmm4, %ymm2 3113 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1 3114 ; AVX512DQVL-NEXT: vpmovsxwd %ymm1, %zmm1 3115 ; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1 3116 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 3117 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 3118 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3119 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 3120 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 3121 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 3122 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 3123 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 3124 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3125 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 3126 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 3127 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] 3128 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 3129 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 3130 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3131 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 3132 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 3133 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 3134 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 3135 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 3136 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3137 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 3138 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 3139 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 3140 ; AVX512DQVL-NEXT: vpmovsxbw %xmm0, %ymm0 3141 ; AVX512DQVL-NEXT: vpmovsxbw %xmm1, %ymm1 3142 
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 3143 ; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0 3144 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 3145 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax 3146 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax 3147 ; AVX512DQVL-NEXT: vzeroupper 3148 ; AVX512DQVL-NEXT: retq 3149 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> %a0) 3150 ret i8 %1 3151 } 3152 3153 declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>) 3154 declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>) 3155 declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>) 3156 declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>) 3157 3158 declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>) 3159 declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>) 3160 declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>) 3161 declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>) 3162 3163 declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>) 3164 declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>) 3165 declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>) 3166 declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>) 3167 3168 declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>) 3169 declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>) 3170 declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>) 3171 declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>) 3172