; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

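; Extract the odd bytes (stride 2, offset 1) of a <64 x i8> load and store them as <32 x i8>.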
define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}

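; Extract the odd words (stride 2, offset 1) of a <32 x i16> load and store them as <16 x i16>.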
define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}

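; Extract the odd dwords (stride 2, offset 1) of a <16 x i32> load and store them as <8 x i32>.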
define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %zmm0
; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
; AVX512BW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i32>, <16 x i32>* %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}

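; Extract every 4th byte, starting at offset 1, of a <64 x i8> load and store the result as <16 x i8>.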
define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

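; As above, but starting at offset 2.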
define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

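; As above, but starting at offset 3.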
define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

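; Extract every 4th word, starting at offset 1, of a <32 x i16> load and store the result as <8 x i16>.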
define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

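; As above, but starting at offset 2.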
define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

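; As above, but starting at offset 3.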
define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

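; Extract every 8th byte, starting at offset 1, of a <64 x i8> load and store the result as <8 x i8>.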
define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

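; As above, but starting at offset 2.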
define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

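; As above, but starting at offset 3.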
define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

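; As above, but starting at offset 4.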
define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

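; As above, but starting at offset 5.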
define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

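; As above, but starting at offset 6.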
define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
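; As above, but starting at offset 7.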
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}