; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL

; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.

define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v16i16_to_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v16i16_to_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v16i16_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <16 x i16>
  %strided.vec = trunc <16 x i16> %bc to <16 x i8>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i16>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %ymm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512-NEXT: vmovaps %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vmovaps %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %bc = bitcast <8 x i32> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i32>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VBMIVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VBMIVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <8 x i32>
  %strided.vec = trunc <8 x i32> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
; IR generated from:
; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
  %bc = bitcast <8 x i8> %truncated.vec to i64
  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
  ret <2 x i64> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
  %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <8 x i32> %vec to <8 x i16>
  %bc = bitcast <8 x i16> %truncated to <16 x i8>
  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
  ret <16 x i8> %result
}

define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <8 x i32> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}

define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
; IR generated from:
; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %bc = bitcast <4 x i16> %truncated to i64
  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
  ret <2 x i64> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
  %truncated = trunc <4 x i64> %vec to <4 x i16>
  %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
  %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %result
}

define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:
vzeroupper 874 ; AVX2-FAST-NEXT: retq 875 ; 876 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: 877 ; AVX512F: # %bb.0: 878 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 879 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 880 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 881 ; AVX512F-NEXT: vzeroupper 882 ; AVX512F-NEXT: retq 883 ; 884 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: 885 ; AVX512VL: # %bb.0: 886 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 887 ; AVX512VL-NEXT: vzeroupper 888 ; AVX512VL-NEXT: retq 889 ; 890 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: 891 ; AVX512BW: # %bb.0: 892 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 893 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 894 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 895 ; AVX512BW-NEXT: vzeroupper 896 ; AVX512BW-NEXT: retq 897 ; 898 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: 899 ; AVX512BWVL: # %bb.0: 900 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 901 ; AVX512BWVL-NEXT: vzeroupper 902 ; AVX512BWVL-NEXT: retq 903 ; 904 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: 905 ; AVX512VBMIVL: # %bb.0: 906 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 907 ; AVX512VBMIVL-NEXT: vzeroupper 908 ; AVX512VBMIVL-NEXT: retq 909 %truncated = trunc <4 x i64> %vec to <4 x i32> 910 %bc = bitcast <4 x i32> %truncated to <8 x i16> 911 %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13> 912 ret <8 x i16> %result 913 } 914 915 define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind { 916 ; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16: 917 ; AVX1: # %bb.0: 918 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 919 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 920 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 921 ; AVX1-NEXT: vzeroupper 922 ; AVX1-NEXT: retq 923 ; 924 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16: 925 ; AVX2-SLOW: # %bb.0: 926 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 927 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 928 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 929 ; AVX2-SLOW-NEXT: vzeroupper 930 ; AVX2-SLOW-NEXT: retq 931 ; 932 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16: 933 ; AVX2-FAST: # %bb.0: 934 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 935 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 936 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 937 ; AVX2-FAST-NEXT: vzeroupper 938 ; AVX2-FAST-NEXT: retq 939 ; 940 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16: 941 ; AVX512F: # %bb.0: 942 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 943 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 944 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 945 ; AVX512F-NEXT: vzeroupper 946 ; AVX512F-NEXT: retq 947 ; 948 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: 949 ; AVX512VL: # %bb.0: 950 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 951 ; AVX512VL-NEXT: vzeroupper 952 ; AVX512VL-NEXT: retq 953 ; 954 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16: 955 ; AVX512BW: # %bb.0: 956 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 957 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 958 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 959 ; AVX512BW-NEXT: vzeroupper 960 ; AVX512BW-NEXT: retq 961 ; 962 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: 963 ; AVX512BWVL: # %bb.0: 964 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 965 ; AVX512BWVL-NEXT: vzeroupper 966 ; AVX512BWVL-NEXT: retq 967 ; 968 ; 
AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: 969 ; AVX512VBMIVL: # %bb.0: 970 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 971 ; AVX512VBMIVL-NEXT: vzeroupper 972 ; AVX512VBMIVL-NEXT: retq 973 %truncated = trunc <4 x i64> %vec to <4 x i16> 974 %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 975 ret <8 x i16> %result 976 } 977 978 define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { 979 ; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8: 980 ; AVX1: # %bb.0: 981 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 982 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 983 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero 984 ; AVX1-NEXT: vzeroupper 985 ; AVX1-NEXT: retq 986 ; 987 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8: 988 ; AVX2-SLOW: # %bb.0: 989 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] 990 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 991 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero 992 ; AVX2-SLOW-NEXT: vzeroupper 993 ; AVX2-SLOW-NEXT: retq 994 ; 995 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8_return_v16i8: 996 ; AVX2-FAST: # %bb.0: 997 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] 998 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 999 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero 1000 ; AVX2-FAST-NEXT: vzeroupper 1001 ; AVX2-FAST-NEXT: retq 1002 ; 1003 ; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8: 1004 ; AVX512F: # %bb.0: 1005 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1006 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1007 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero 1008 ; AVX512F-NEXT: vzeroupper 1009 
; AVX512F-NEXT: retq 1010 ; 1011 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: 1012 ; AVX512VL: # %bb.0: 1013 ; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 1014 ; AVX512VL-NEXT: vzeroupper 1015 ; AVX512VL-NEXT: retq 1016 ; 1017 ; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8: 1018 ; AVX512BW: # %bb.0: 1019 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1020 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1021 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero 1022 ; AVX512BW-NEXT: vzeroupper 1023 ; AVX512BW-NEXT: retq 1024 ; 1025 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: 1026 ; AVX512BWVL: # %bb.0: 1027 ; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 1028 ; AVX512BWVL-NEXT: vzeroupper 1029 ; AVX512BWVL-NEXT: retq 1030 ; 1031 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: 1032 ; AVX512VBMIVL: # %bb.0: 1033 ; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0 1034 ; AVX512VBMIVL-NEXT: vzeroupper 1035 ; AVX512VBMIVL-NEXT: retq 1036 %truncated = trunc <4 x i64> %vec to <4 x i8> 1037 %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7> 1038 ret <16 x i8> %result 1039 } 1040 1041 define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { 1042 ; AVX1-LABEL: shuffle_v16i16_to_v4i16: 1043 ; AVX1: # %bb.0: 1044 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 1045 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1046 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1047 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1048 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1049 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1050 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1051 ; AVX1-NEXT: vmovq %xmm0, (%rsi) 1052 ; AVX1-NEXT: vzeroupper 1053 ; AVX1-NEXT: retq 1054 ; 1055 ; AVX2-SLOW-LABEL: 
shuffle_v16i16_to_v4i16: 1056 ; AVX2-SLOW: # %bb.0: 1057 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 1058 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 1059 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1060 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1061 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1062 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1063 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1064 ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) 1065 ; AVX2-SLOW-NEXT: vzeroupper 1066 ; AVX2-SLOW-NEXT: retq 1067 ; 1068 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16: 1069 ; AVX2-FAST: # %bb.0: 1070 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 1071 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 1072 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] 1073 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1074 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1075 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1076 ; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) 1077 ; AVX2-FAST-NEXT: vzeroupper 1078 ; AVX2-FAST-NEXT: retq 1079 ; 1080 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16: 1081 ; AVX512F: # %bb.0: 1082 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1083 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 1084 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1085 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1086 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1087 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1088 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1089 ; AVX512F-NEXT: vmovq %xmm0, (%rsi) 1090 ; AVX512F-NEXT: vzeroupper 1091 ; AVX512F-NEXT: retq 1092 ; 1093 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: 1094 ; AVX512VL: # %bb.0: 1095 ; AVX512VL-NEXT: vmovaps (%rdi), %ymm0 1096 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 1097 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[0,2],xmm1[0,2] 1098 ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) 1099 ; AVX512VL-NEXT: vzeroupper 1100 ; AVX512VL-NEXT: retq 1101 ; 1102 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: 1103 ; AVX512BW: # %bb.0: 1104 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1105 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1106 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] 1107 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1108 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1109 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1110 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) 1111 ; AVX512BW-NEXT: vzeroupper 1112 ; AVX512BW-NEXT: retq 1113 ; 1114 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: 1115 ; AVX512BWVL: # %bb.0: 1116 ; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0 1117 ; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1 1118 ; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1119 ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) 1120 ; AVX512BWVL-NEXT: vzeroupper 1121 ; AVX512BWVL-NEXT: retq 1122 ; 1123 ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16: 1124 ; AVX512VBMIVL: # %bb.0: 1125 ; AVX512VBMIVL-NEXT: vmovaps (%rdi), %ymm0 1126 ; AVX512VBMIVL-NEXT: vextractf128 $1, %ymm0, %xmm1 1127 ; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1128 ; AVX512VBMIVL-NEXT: vpmovdw %xmm0, (%rsi) 1129 ; AVX512VBMIVL-NEXT: vzeroupper 1130 ; AVX512VBMIVL-NEXT: retq 1131 %vec = load <16 x i16>, <16 x i16>* %L 1132 %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 1133 store <4 x i16> %strided.vec, <4 x i16>* %S 1134 ret void 1135 } 1136 1137 define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { 1138 ; AVX1-LABEL: trunc_v4i64_to_v4i16: 1139 ; AVX1: # %bb.0: 1140 ; AVX1-NEXT: vmovaps (%rdi), %ymm0 1141 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1142 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1143 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1144 ; AVX1-NEXT: vmovq %xmm0, (%rsi) 1145 ; AVX1-NEXT: vzeroupper 1146 ; AVX1-NEXT: retq 1147 ; 1148 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16: 1149 ; AVX2-SLOW: # %bb.0: 1150 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] 1151 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1152 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1153 ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) 1154 ; AVX2-SLOW-NEXT: vzeroupper 1155 ; AVX2-SLOW-NEXT: retq 1156 ; 1157 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16: 1158 ; AVX2-FAST: # %bb.0: 1159 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] 1160 ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0 1161 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1162 ; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) 1163 ; AVX2-FAST-NEXT: vzeroupper 1164 ; AVX2-FAST-NEXT: retq 1165 ; 1166 ; AVX512F-LABEL: trunc_v4i64_to_v4i16: 1167 ; AVX512F: # %bb.0: 1168 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1169 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1170 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1171 ; AVX512F-NEXT: vmovq %xmm0, (%rsi) 1172 ; AVX512F-NEXT: vzeroupper 1173 ; AVX512F-NEXT: retq 1174 ; 1175 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16: 1176 ; AVX512VL: # %bb.0: 1177 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 1178 ; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) 1179 ; AVX512VL-NEXT: vzeroupper 1180 ; AVX512VL-NEXT: retq 1181 ; 1182 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16: 1183 ; AVX512BW: # %bb.0: 1184 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1185 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1186 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1187 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) 1188 ; AVX512BW-NEXT: vzeroupper 1189 ; AVX512BW-NEXT: retq 1190 ; 1191 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16: 1192 ; AVX512BWVL: # %bb.0: 1193 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 
1194 ; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) 1195 ; AVX512BWVL-NEXT: vzeroupper 1196 ; AVX512BWVL-NEXT: retq 1197 ; 1198 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16: 1199 ; AVX512VBMIVL: # %bb.0: 1200 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 1201 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi) 1202 ; AVX512VBMIVL-NEXT: vzeroupper 1203 ; AVX512VBMIVL-NEXT: retq 1204 %vec = load <16 x i16>, <16 x i16>* %L 1205 %bc = bitcast <16 x i16> %vec to <4 x i64> 1206 %strided.vec = trunc <4 x i64> %bc to <4 x i16> 1207 store <4 x i16> %strided.vec, <4 x i16>* %S 1208 ret void 1209 } 1210 1211 define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { 1212 ; AVX1-LABEL: shuffle_v32i8_to_v4i8: 1213 ; AVX1: # %bb.0: 1214 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 1215 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1216 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1217 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1218 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1219 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1220 ; AVX1-NEXT: vmovd %xmm0, (%rsi) 1221 ; AVX1-NEXT: vzeroupper 1222 ; AVX1-NEXT: retq 1223 ; 1224 ; AVX2-LABEL: shuffle_v32i8_to_v4i8: 1225 ; AVX2: # %bb.0: 1226 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1227 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1228 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1229 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1230 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1231 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1232 ; AVX2-NEXT: vmovd %xmm0, (%rsi) 1233 ; AVX2-NEXT: vzeroupper 1234 ; AVX2-NEXT: retq 1235 ; 1236 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8: 1237 ; AVX512F: # %bb.0: 1238 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1239 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 1240 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1241 ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, 
%xmm1 1242 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1243 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1244 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) 1245 ; AVX512F-NEXT: vzeroupper 1246 ; AVX512F-NEXT: retq 1247 ; 1248 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: 1249 ; AVX512VL: # %bb.0: 1250 ; AVX512VL-NEXT: vmovaps (%rdi), %ymm0 1251 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 1252 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1253 ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) 1254 ; AVX512VL-NEXT: vzeroupper 1255 ; AVX512VL-NEXT: retq 1256 ; 1257 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: 1258 ; AVX512BW: # %bb.0: 1259 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1260 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1261 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1262 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1263 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1264 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1265 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) 1266 ; AVX512BW-NEXT: vzeroupper 1267 ; AVX512BW-NEXT: retq 1268 ; 1269 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: 1270 ; AVX512BWVL: # %bb.0: 1271 ; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0 1272 ; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1 1273 ; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1274 ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) 1275 ; AVX512BWVL-NEXT: vzeroupper 1276 ; AVX512BWVL-NEXT: retq 1277 ; 1278 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8: 1279 ; AVX512VBMIVL: # %bb.0: 1280 ; AVX512VBMIVL-NEXT: vmovaps (%rdi), %ymm0 1281 ; AVX512VBMIVL-NEXT: vextractf128 $1, %ymm0, %xmm1 1282 ; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1283 ; AVX512VBMIVL-NEXT: vpmovdb %xmm0, (%rsi) 1284 ; AVX512VBMIVL-NEXT: vzeroupper 1285 ; AVX512VBMIVL-NEXT: retq 1286 %vec = load <32 x i8>, <32 x i8>* %L 1287 %strided.vec = shufflevector <32 
x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24> 1288 store <4 x i8> %strided.vec, <4 x i8>* %S 1289 ret void 1290 } 1291 1292 define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { 1293 ; AVX1-LABEL: trunc_v4i64_to_v4i8: 1294 ; AVX1: # %bb.0: 1295 ; AVX1-NEXT: vmovaps (%rdi), %ymm0 1296 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1297 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1298 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] 1299 ; AVX1-NEXT: vmovd %xmm0, (%rsi) 1300 ; AVX1-NEXT: vzeroupper 1301 ; AVX1-NEXT: retq 1302 ; 1303 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8: 1304 ; AVX2-SLOW: # %bb.0: 1305 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] 1306 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1307 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] 1308 ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) 1309 ; AVX2-SLOW-NEXT: vzeroupper 1310 ; AVX2-SLOW-NEXT: retq 1311 ; 1312 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8: 1313 ; AVX2-FAST: # %bb.0: 1314 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] 1315 ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0 1316 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] 1317 ; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) 1318 ; AVX2-FAST-NEXT: vzeroupper 1319 ; AVX2-FAST-NEXT: retq 1320 ; 1321 ; AVX512F-LABEL: trunc_v4i64_to_v4i8: 1322 ; AVX512F: # %bb.0: 1323 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1324 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1325 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] 1326 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) 1327 ; AVX512F-NEXT: vzeroupper 1328 ; AVX512F-NEXT: retq 1329 ; 1330 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8: 1331 ; AVX512VL: # %bb.0: 1332 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 1333 ; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) 1334 ; AVX512VL-NEXT: vzeroupper 1335 ; AVX512VL-NEXT: retq 1336 ; 1337 ; AVX512BW-LABEL: 
trunc_v4i64_to_v4i8: 1338 ; AVX512BW: # %bb.0: 1339 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1340 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1341 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] 1342 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) 1343 ; AVX512BW-NEXT: vzeroupper 1344 ; AVX512BW-NEXT: retq 1345 ; 1346 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8: 1347 ; AVX512BWVL: # %bb.0: 1348 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 1349 ; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) 1350 ; AVX512BWVL-NEXT: vzeroupper 1351 ; AVX512BWVL-NEXT: retq 1352 ; 1353 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8: 1354 ; AVX512VBMIVL: # %bb.0: 1355 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 1356 ; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi) 1357 ; AVX512VBMIVL-NEXT: vzeroupper 1358 ; AVX512VBMIVL-NEXT: retq 1359 %vec = load <32 x i8>, <32 x i8>* %L 1360 %bc = bitcast <32 x i8> %vec to <4 x i64> 1361 %strided.vec = trunc <4 x i64> %bc to <4 x i8> 1362 store <4 x i8> %strided.vec, <4 x i8>* %S 1363 ret void 1364 } 1365 1366 ; In this case not all elements are collected from the same source vector, so 1367 ; the resulting BUILD_VECTOR should not be combined to a truncate. 
1368 define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { 1369 ; AVX1-LABEL: negative: 1370 ; AVX1: # %bb.0: 1371 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero 1372 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1373 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14] 1374 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 1375 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1376 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 1377 ; AVX1-NEXT: vzeroupper 1378 ; AVX1-NEXT: retq 1379 ; 1380 ; AVX2-LABEL: negative: 1381 ; AVX2: # %bb.0: 1382 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] 1383 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1384 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1385 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 1386 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1387 ; AVX2-NEXT: vzeroupper 1388 ; AVX2-NEXT: retq 1389 ; 1390 ; AVX512F-LABEL: negative: 1391 ; AVX512F: # %bb.0: 1392 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] 1393 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1394 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1395 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 1396 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1397 ; AVX512F-NEXT: vzeroupper 1398 ; AVX512F-NEXT: retq 1399 ; 1400 ; AVX512VL-LABEL: negative: 1401 ; AVX512VL: # %bb.0: 1402 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] 1403 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1404 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1405 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 1406 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1407 ; AVX512VL-NEXT: vzeroupper 1408 ; AVX512VL-NEXT: retq 1409 ; 1410 ; AVX512BW-LABEL: negative: 1411 ; AVX512BW: # %bb.0: 1412 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] 1413 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1414 ; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 1415 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 1416 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1417 ; AVX512BW-NEXT: vzeroupper 1418 ; AVX512BW-NEXT: retq 1419 ; 1420 ; AVX512BWVL-LABEL: negative: 1421 ; AVX512BWVL: # %bb.0: 1422 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] 1423 ; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001 1424 ; AVX512BWVL-NEXT: kmovd %eax, %k1 1425 ; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} 1426 ; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] 1427 ; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1428 ; AVX512BWVL-NEXT: vzeroupper 1429 ; AVX512BWVL-NEXT: retq 1430 ; 1431 ; AVX512VBMIVL-LABEL: negative: 1432 ; AVX512VBMIVL: # %bb.0: 1433 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,48,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] 1434 ; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 1435 ; AVX512VBMIVL-NEXT: # kill: def $xmm0 
killed $xmm0 killed $ymm0 1436 ; AVX512VBMIVL-NEXT: vzeroupper 1437 ; AVX512VBMIVL-NEXT: retq 1438 %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 1439 %w0 = extractelement <32 x i8> %w, i32 0 1440 %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0 1441 ret <16 x i8> %merged 1442 } 1443