; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.

define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v8i16_to_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v8i16_to_v8i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v8i16_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i16_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i16_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i16_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <8 x i16>
  %strided.vec = trunc <8 x i16> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v4i32_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %bc = bitcast <8 x i16> %vec to <4 x i32>
  %strided.vec = trunc <4 x i32> %bc to <4 x i16>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v4i32_to_v2i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_to_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v4i32_to_v2i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_v2i64_to_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %bc = bitcast <4 x i32> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i32>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v4i32_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <4 x i32>
  %strided.vec = trunc <4 x i32> %bc to <4 x i8>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_v2i64_to_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %bc = bitcast <8 x i16> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i16>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v2i64_to_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v2i64_to_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v2i64_to_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i8>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}