; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %ymm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
;
AVX512BW-NEXT: retq 841 ; 842 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3: 843 ; AVX512BWVL: # %bb.0: 844 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 845 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 846 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11] 847 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 848 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 849 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 850 ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) 851 ; AVX512BWVL-NEXT: vzeroupper 852 ; AVX512BWVL-NEXT: retq 853 %vec = load <32 x i8>, <32 x i8>* %L 854 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27> 855 store <4 x i8> %strided.vec, <4 x i8>* %S 856 ret void 857 } 858 859 define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind { 860 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_4: 861 ; AVX1: # %bb.0: 862 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 863 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 864 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 865 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 866 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 867 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 868 ; AVX1-NEXT: vmovd %xmm0, (%rsi) 869 ; AVX1-NEXT: vzeroupper 870 ; AVX1-NEXT: retq 871 ; 872 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_4: 873 ; AVX2: # %bb.0: 874 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 875 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 876 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 877 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 878 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 879 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 880 ; AVX2-NEXT: vmovd %xmm0, (%rsi) 881 ; AVX2-NEXT: vzeroupper 882 ; AVX2-NEXT: retq 883 ; 884 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4: 885 ; AVX512F: # %bb.0: 886 ; AVX512F-NEXT: vmovdqa 
(%rdi), %ymm0 887 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 888 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 889 ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 890 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 891 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 892 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) 893 ; AVX512F-NEXT: vzeroupper 894 ; AVX512F-NEXT: retq 895 ; 896 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4: 897 ; AVX512VL: # %bb.0: 898 ; AVX512VL-NEXT: vmovaps (%rdi), %ymm0 899 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 900 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 901 ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) 902 ; AVX512VL-NEXT: vzeroupper 903 ; AVX512VL-NEXT: retq 904 ; 905 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4: 906 ; AVX512BW: # %bb.0: 907 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 908 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 909 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 910 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 911 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 912 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 913 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) 914 ; AVX512BW-NEXT: vzeroupper 915 ; AVX512BW-NEXT: retq 916 ; 917 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4: 918 ; AVX512BWVL: # %bb.0: 919 ; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0 920 ; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1 921 ; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] 922 ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) 923 ; AVX512BWVL-NEXT: vzeroupper 924 ; AVX512BWVL-NEXT: retq 925 %vec = load <32 x i8>, <32 x i8>* %L 926 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28> 927 store <4 x i8> %strided.vec, <4 x i8>* %S 928 ret void 929 } 930 931 define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind { 
932 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_5: 933 ; AVX1: # %bb.0: 934 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 935 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 936 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 937 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 938 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 939 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 940 ; AVX1-NEXT: vmovd %xmm0, (%rsi) 941 ; AVX1-NEXT: vzeroupper 942 ; AVX1-NEXT: retq 943 ; 944 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_5: 945 ; AVX2: # %bb.0: 946 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 947 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 948 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 949 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 950 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 951 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 952 ; AVX2-NEXT: vmovd %xmm0, (%rsi) 953 ; AVX2-NEXT: vzeroupper 954 ; AVX2-NEXT: retq 955 ; 956 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5: 957 ; AVX512F: # %bb.0: 958 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 959 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 960 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 961 ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 962 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 963 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 964 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) 965 ; AVX512F-NEXT: vzeroupper 966 ; AVX512F-NEXT: retq 967 ; 968 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5: 969 ; AVX512VL: # %bb.0: 970 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 971 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 972 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] 973 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 974 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 975 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 976 
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) 977 ; AVX512VL-NEXT: vzeroupper 978 ; AVX512VL-NEXT: retq 979 ; 980 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5: 981 ; AVX512BW: # %bb.0: 982 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 983 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 984 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 985 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 986 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 987 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 988 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) 989 ; AVX512BW-NEXT: vzeroupper 990 ; AVX512BW-NEXT: retq 991 ; 992 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5: 993 ; AVX512BWVL: # %bb.0: 994 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 995 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 996 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] 997 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 998 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 999 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1000 ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) 1001 ; AVX512BWVL-NEXT: vzeroupper 1002 ; AVX512BWVL-NEXT: retq 1003 %vec = load <32 x i8>, <32 x i8>* %L 1004 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29> 1005 store <4 x i8> %strided.vec, <4 x i8>* %S 1006 ret void 1007 } 1008 1009 define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind { 1010 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_6: 1011 ; AVX1: # %bb.0: 1012 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 1013 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1014 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1015 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1016 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1017 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1018 ; AVX1-NEXT: vmovd %xmm0, (%rsi) 1019 ; AVX1-NEXT: 
vzeroupper 1020 ; AVX1-NEXT: retq 1021 ; 1022 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_6: 1023 ; AVX2: # %bb.0: 1024 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1025 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1026 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1027 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1028 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1029 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1030 ; AVX2-NEXT: vmovd %xmm0, (%rsi) 1031 ; AVX2-NEXT: vzeroupper 1032 ; AVX2-NEXT: retq 1033 ; 1034 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6: 1035 ; AVX512F: # %bb.0: 1036 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1037 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 1038 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1039 ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1040 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1041 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1042 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) 1043 ; AVX512F-NEXT: vzeroupper 1044 ; AVX512F-NEXT: retq 1045 ; 1046 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6: 1047 ; AVX512VL: # %bb.0: 1048 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 1049 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 1050 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] 1051 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1052 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1053 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1054 ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) 1055 ; AVX512VL-NEXT: vzeroupper 1056 ; AVX512VL-NEXT: retq 1057 ; 1058 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6: 1059 ; AVX512BW: # %bb.0: 1060 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1061 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1062 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1063 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1064 ; AVX512BW-NEXT: vpshufb 
%xmm2, %xmm0, %xmm0 1065 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1066 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) 1067 ; AVX512BW-NEXT: vzeroupper 1068 ; AVX512BW-NEXT: retq 1069 ; 1070 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6: 1071 ; AVX512BWVL: # %bb.0: 1072 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 1073 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 1074 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] 1075 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1076 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1077 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1078 ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) 1079 ; AVX512BWVL-NEXT: vzeroupper 1080 ; AVX512BWVL-NEXT: retq 1081 %vec = load <32 x i8>, <32 x i8>* %L 1082 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30> 1083 store <4 x i8> %strided.vec, <4 x i8>* %S 1084 ret void 1085 } 1086 1087 define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind { 1088 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_7: 1089 ; AVX1: # %bb.0: 1090 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 1091 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1092 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1093 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1094 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1095 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1096 ; AVX1-NEXT: vmovd %xmm0, (%rsi) 1097 ; AVX1-NEXT: vzeroupper 1098 ; AVX1-NEXT: retq 1099 ; 1100 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_7: 1101 ; AVX2: # %bb.0: 1102 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1103 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1104 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1105 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1106 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1107 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1108 ; AVX2-NEXT: vmovd %xmm0, (%rsi) 1109 ; AVX2-NEXT: vzeroupper 1110 ; AVX2-NEXT: retq 1111 ; 1112 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7: 1113 ; AVX512F: # %bb.0: 1114 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 1115 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 1116 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1117 ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1118 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1119 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1120 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) 1121 ; AVX512F-NEXT: vzeroupper 1122 ; AVX512F-NEXT: retq 1123 ; 1124 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7: 1125 ; AVX512VL: # %bb.0: 1126 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 1127 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 1128 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6] 1129 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1130 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1131 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1132 ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) 1133 ; AVX512VL-NEXT: vzeroupper 1134 ; AVX512VL-NEXT: retq 1135 ; 1136 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7: 1137 ; AVX512BW: # %bb.0: 1138 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1139 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1140 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 1141 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1142 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1143 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1144 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) 1145 ; AVX512BW-NEXT: vzeroupper 1146 ; AVX512BW-NEXT: retq 1147 ; 1148 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7: 1149 ; AVX512BWVL: # %bb.0: 1150 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 1151 ; AVX512BWVL-NEXT: vextracti128 $1, 
%ymm0, %xmm1 1152 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6] 1153 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1154 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1155 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1156 ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) 1157 ; AVX512BWVL-NEXT: vzeroupper 1158 ; AVX512BWVL-NEXT: retq 1159 %vec = load <32 x i8>, <32 x i8>* %L 1160 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31> 1161 store <4 x i8> %strided.vec, <4 x i8>* %S 1162 ret void 1163 } 1164 1165