; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

; These tests exercise x86 codegen for strided-extract shuffles of 128-bit
; vectors stored as narrower vectors (i.e. "every Nth element starting at a
; nonzero offset" truncation patterns). The check lines above each IR body are
; machine-generated by update_llc_test_checks.py; regenerate them with that
; script rather than editing them by hand.

define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v4i16_1(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v4i32_to_v2i32_1(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_1(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_2(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_3(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_1(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_2(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_3(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_4(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_5(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $40, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $40, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_6(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT: retq 879 ; 880 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_6: 881 ; AVX512BWVL: # %bb.0: 882 ; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %xmm0 883 ; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) 884 ; AVX512BWVL-NEXT: retq 885 %vec = load <16 x i8>, <16 x i8>* %L 886 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14> 887 store <2 x i8> %strided.vec, <2 x i8>* %S 888 ret void 889 } 890 891 define void @shuffle_v16i8_to_v2i8_7(<16 x i8>* %L, <2 x i8>* %S) nounwind { 892 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_7: 893 ; SSE2: # %bb.0: 894 ; SSE2-NEXT: movdqa (%rdi), %xmm0 895 ; SSE2-NEXT: pxor %xmm1, %xmm1 896 ; SSE2-NEXT: movdqa %xmm0, %xmm2 897 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 898 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 899 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 900 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] 901 ; SSE2-NEXT: packuswb %xmm0, %xmm0 902 ; SSE2-NEXT: movd %xmm0, %eax 903 ; SSE2-NEXT: movw %ax, (%rsi) 904 ; SSE2-NEXT: retq 905 ; 906 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_7: 907 ; SSE42: # %bb.0: 908 ; SSE42-NEXT: movdqa (%rdi), %xmm0 909 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 910 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) 911 ; SSE42-NEXT: retq 912 ; 913 ; AVX-LABEL: shuffle_v16i8_to_v2i8_7: 914 ; AVX: # %bb.0: 915 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 916 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 917 ; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) 918 ; AVX-NEXT: retq 919 ; 920 ; AVX512F-LABEL: shuffle_v16i8_to_v2i8_7: 921 ; AVX512F: # %bb.0: 922 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 923 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 924 ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) 925 ; AVX512F-NEXT: retq 926 ; 927 ; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_7: 928 ; AVX512VL: # %bb.0: 929 ; AVX512VL-NEXT: vpsrlq $56, (%rdi), %xmm0 930 ; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) 931 ; AVX512VL-NEXT: retq 932 ; 933 ; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_7: 934 ; AVX512BW: # %bb.0: 935 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 936 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 937 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) 938 ; AVX512BW-NEXT: retq 939 ; 940 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_7: 941 ; AVX512BWVL: # %bb.0: 942 ; AVX512BWVL-NEXT: vpsrlq $56, (%rdi), %xmm0 943 ; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) 944 ; AVX512BWVL-NEXT: retq 945 %vec = load <16 x i8>, <16 x i8>* %L 946 %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15> 947 store <2 x i8> %strided.vec, <2 x i8>* %S 948 ret void 949 } 950 951