1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1 3 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 4 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 5 6 define <4 x double> @load_factorf64_4(<16 x double>* %ptr) { 7 ; AVX1-LABEL: load_factorf64_4: 8 ; AVX1: # %bb.0: 9 ; AVX1-NEXT: vmovupd (%rdi), %ymm0 10 ; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 11 ; AVX1-NEXT: vmovupd 64(%rdi), %ymm2 12 ; AVX1-NEXT: vmovupd 96(%rdi), %ymm3 13 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] 14 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] 15 ; AVX1-NEXT: vhaddpd %ymm5, %ymm4, %ymm4 16 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 17 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 18 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 19 ; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2 20 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 21 ; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 22 ; AVX1-NEXT: retq 23 ; 24 ; AVX-LABEL: load_factorf64_4: 25 ; AVX: # %bb.0: 26 ; AVX-NEXT: vmovupd (%rdi), %ymm0 27 ; AVX-NEXT: vmovupd 32(%rdi), %ymm1 28 ; AVX-NEXT: vmovupd 64(%rdi), %ymm2 29 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 30 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] 31 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] 32 ; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4 33 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 34 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 35 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 36 ; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2 37 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 38 ; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0 39 ; AVX-NEXT: retq 40 
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 41 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 42 %strided.v1 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 43 %strided.v2 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 44 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 45 %add1 = fadd <4 x double> %strided.v0, %strided.v1 46 %add2 = fadd <4 x double> %add1, %strided.v2 47 %add3 = fadd <4 x double> %add2, %strided.v3 48 ret <4 x double> %add3 49 } 50 51 define <4 x double> @load_factorf64_2(<16 x double>* %ptr) { 52 ; AVX1-LABEL: load_factorf64_2: 53 ; AVX1: # %bb.0: 54 ; AVX1-NEXT: vmovupd (%rdi), %ymm0 55 ; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 56 ; AVX1-NEXT: vmovupd 64(%rdi), %ymm2 57 ; AVX1-NEXT: vmovupd 96(%rdi), %ymm3 58 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] 59 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] 60 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] 61 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 62 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 63 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 64 ; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm0 65 ; AVX1-NEXT: retq 66 ; 67 ; AVX-LABEL: load_factorf64_2: 68 ; AVX: # %bb.0: 69 ; AVX-NEXT: vmovupd (%rdi), %ymm0 70 ; AVX-NEXT: vmovupd 32(%rdi), %ymm1 71 ; AVX-NEXT: vmovupd 64(%rdi), %ymm2 72 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 73 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] 74 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] 75 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] 76 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 77 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 78 ; 
AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 79 ; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0 80 ; AVX-NEXT: retq 81 %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 82 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 83 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 84 %mul = fmul <4 x double> %strided.v0, %strided.v3 85 ret <4 x double> %mul 86 } 87 88 define <4 x double> @load_factorf64_1(<16 x double>* %ptr) { 89 ; AVX1-LABEL: load_factorf64_1: 90 ; AVX1: # %bb.0: 91 ; AVX1-NEXT: vmovupd (%rdi), %ymm0 92 ; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 93 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1] 94 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1] 95 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 96 ; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0 97 ; AVX1-NEXT: retq 98 ; 99 ; AVX-LABEL: load_factorf64_1: 100 ; AVX: # %bb.0: 101 ; AVX-NEXT: vmovupd (%rdi), %ymm0 102 ; AVX-NEXT: vmovupd 32(%rdi), %ymm1 103 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1] 104 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1] 105 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 106 ; AVX-NEXT: vmulpd %ymm0, %ymm0, %ymm0 107 ; AVX-NEXT: retq 108 %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 109 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 110 %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 111 %mul = fmul <4 x double> %strided.v0, %strided.v3 112 ret <4 x double> %mul 113 } 114 115 define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) { 116 ; AVX1-LABEL: load_factori64_4: 117 ; AVX1: # %bb.0: 118 ; AVX1-NEXT: vmovups (%rdi), %ymm0 119 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1 120 ; AVX1-NEXT: vmovups 64(%rdi), 
%ymm2 121 ; AVX1-NEXT: vmovups 96(%rdi), %ymm3 122 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] 123 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] 124 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 125 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 126 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] 127 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 128 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] 129 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 130 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1 131 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 132 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm4 133 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 134 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 135 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 136 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 137 ; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 138 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 139 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 140 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 141 ; AVX1-NEXT: retq 142 ; 143 ; AVX-LABEL: load_factori64_4: 144 ; AVX: # %bb.0: 145 ; AVX-NEXT: vmovdqu (%rdi), %ymm0 146 ; AVX-NEXT: vmovdqu 32(%rdi), %ymm1 147 ; AVX-NEXT: vmovdqu 64(%rdi), %ymm2 148 ; AVX-NEXT: vmovdqu 96(%rdi), %ymm3 149 ; AVX-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] 150 ; AVX-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] 151 ; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 152 ; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 153 ; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] 154 ; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 155 ; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] 156 ; AVX-NEXT: vpaddq %ymm3, %ymm4, %ymm3 157 ; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 158 ; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0 159 ; 
AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0 160 ; AVX-NEXT: retq 161 %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16 162 %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 163 %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 164 %strided.v2 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 165 %strided.v3 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 166 %add1 = add <4 x i64> %strided.v0, %strided.v1 167 %add2 = add <4 x i64> %add1, %strided.v2 168 %add3 = add <4 x i64> %add2, %strided.v3 169 ret <4 x i64> %add3 170 } 171 172 define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) { 173 ; AVX1-LABEL: store_factorf64_4: 174 ; AVX1: # %bb.0: 175 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 176 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 177 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 178 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 179 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] 180 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 181 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] 182 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 183 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi) 184 ; AVX1-NEXT: vmovups %ymm3, 64(%rdi) 185 ; AVX1-NEXT: vmovups %ymm4, 32(%rdi) 186 ; AVX1-NEXT: vmovups %ymm2, (%rdi) 187 ; AVX1-NEXT: vzeroupper 188 ; AVX1-NEXT: retq 189 ; 190 ; AVX2-LABEL: store_factorf64_4: 191 ; AVX2: # %bb.0: 192 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 193 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 194 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 195 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 196 ; AVX2-NEXT: 
vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] 197 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 198 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] 199 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 200 ; AVX2-NEXT: vmovups %ymm0, 96(%rdi) 201 ; AVX2-NEXT: vmovups %ymm3, 64(%rdi) 202 ; AVX2-NEXT: vmovups %ymm4, 32(%rdi) 203 ; AVX2-NEXT: vmovups %ymm2, (%rdi) 204 ; AVX2-NEXT: vzeroupper 205 ; AVX2-NEXT: retq 206 ; 207 ; AVX512-LABEL: store_factorf64_4: 208 ; AVX512: # %bb.0: 209 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 210 ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 211 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 212 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 213 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] 214 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 215 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] 216 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 217 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1 218 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0 219 ; AVX512-NEXT: vmovups %zmm0, 64(%rdi) 220 ; AVX512-NEXT: vmovups %zmm1, (%rdi) 221 ; AVX512-NEXT: vzeroupper 222 ; AVX512-NEXT: retq 223 %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 224 %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 225 %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15> 226 store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16 227 ret void 228 } 229 230 define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, 
<4 x i64> %v2, <4 x i64> %v3) { 231 ; AVX1-LABEL: store_factori64_4: 232 ; AVX1: # %bb.0: 233 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 234 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 235 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 236 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 237 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] 238 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 239 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] 240 ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 241 ; AVX1-NEXT: vmovups %ymm0, 96(%rdi) 242 ; AVX1-NEXT: vmovups %ymm3, 64(%rdi) 243 ; AVX1-NEXT: vmovups %ymm4, 32(%rdi) 244 ; AVX1-NEXT: vmovups %ymm2, (%rdi) 245 ; AVX1-NEXT: vzeroupper 246 ; AVX1-NEXT: retq 247 ; 248 ; AVX2-LABEL: store_factori64_4: 249 ; AVX2: # %bb.0: 250 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 251 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 252 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 253 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 254 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] 255 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 256 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] 257 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 258 ; AVX2-NEXT: vmovups %ymm0, 96(%rdi) 259 ; AVX2-NEXT: vmovups %ymm3, 64(%rdi) 260 ; AVX2-NEXT: vmovups %ymm4, 32(%rdi) 261 ; AVX2-NEXT: vmovups %ymm2, (%rdi) 262 ; AVX2-NEXT: vzeroupper 263 ; AVX2-NEXT: retq 264 ; 265 ; AVX512-LABEL: store_factori64_4: 266 ; AVX512: # %bb.0: 267 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 268 ; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 269 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 270 ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 271 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 
= ymm4[0],ymm5[0],ymm4[2],ymm5[2] 272 ; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 273 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] 274 ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 275 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1 276 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0 277 ; AVX512-NEXT: vmovups %zmm0, 64(%rdi) 278 ; AVX512-NEXT: vmovups %zmm1, (%rdi) 279 ; AVX512-NEXT: vzeroupper 280 ; AVX512-NEXT: retq 281 %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 282 %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 283 %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15> 284 store <16 x i64> %interleaved.vec, <16 x i64>* %ptr, align 16 285 ret void 286 } 287 288 289 define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, <128 x i8>* %p) { 290 ; AVX1-LABEL: interleaved_store_vf32_i8_stride4: 291 ; AVX1: # %bb.0: 292 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 293 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 294 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 295 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 296 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 297 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 298 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 299 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 300 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 301 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 302 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 303 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] 304 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] 305 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 306 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 307 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] 308 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 309 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 310 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] 311 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] 312 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 313 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] 314 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] 315 ; AVX1-NEXT: vinsertf128 
$1, %xmm0, %ymm2, %ymm0 316 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 317 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 318 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 319 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] 320 ; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) 321 ; AVX1-NEXT: vmovaps %ymm1, 64(%rdi) 322 ; AVX1-NEXT: vmovaps %ymm2, 32(%rdi) 323 ; AVX1-NEXT: vmovaps %ymm4, (%rdi) 324 ; AVX1-NEXT: vzeroupper 325 ; AVX1-NEXT: retq 326 ; 327 ; AVX2-LABEL: interleaved_store_vf32_i8_stride4: 328 ; AVX2: # %bb.0: 329 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 330 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 331 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 332 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 333 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 334 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 335 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 336 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 337 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 338 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 339 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 340 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 341 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi) 342 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdi) 343 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi) 344 ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) 345 ; AVX2-NEXT: vzeroupper 346 ; AVX2-NEXT: retq 347 ; 348 ; AVX512-LABEL: interleaved_store_vf32_i8_stride4: 349 ; AVX512: # %bb.0: 350 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] 351 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] 352 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 353 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] 354 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] 355 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] 356 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 357 ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 358 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 359 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 360 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 361 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 362 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 363 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 364 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdi) 365 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi) 366 ; AVX512-NEXT: vzeroupper 367 ; AVX512-NEXT: retq 368 %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 369 %v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 370 %interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 
93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127> 371 store <128 x i8> %interleaved.vec, <128 x i8>* %p 372 ret void 373 } 374 375 define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, <64 x i8>* %p) { 376 ; AVX1-LABEL: interleaved_store_vf16_i8_stride4: 377 ; AVX1: # %bb.0: 378 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 379 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 380 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 381 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 382 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 383 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 384 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 385 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 386 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 387 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 388 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) 389 ; AVX1-NEXT: vmovaps %ymm1, (%rdi) 390 ; AVX1-NEXT: vzeroupper 391 ; AVX1-NEXT: retq 392 ; 393 ; AVX2-LABEL: interleaved_store_vf16_i8_stride4: 394 ; AVX2: # %bb.0: 395 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 396 ; 
AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 397 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 398 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 399 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 400 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 401 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 402 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 403 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 404 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 405 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) 406 ; AVX2-NEXT: vmovdqa %ymm1, (%rdi) 407 ; AVX2-NEXT: vzeroupper 408 ; AVX2-NEXT: retq 409 ; 410 ; AVX512-LABEL: interleaved_store_vf16_i8_stride4: 411 ; AVX512: # %bb.0: 412 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 413 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 414 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 415 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 416 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 417 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 418 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 419 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 420 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 421 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 422 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 423 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi) 424 ; AVX512-NEXT: vzeroupper 425 ; AVX512-NEXT: retq 426 %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 427 %v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 428 %interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63> 429 store <64 
x i8> %interleaved.vec, <64 x i8>* %p
  ret void
}

; Stride-4 deinterleaving load: 32 bytes are loaded, the four stride-4 byte
; subsequences (offsets 0..3 mod 4) are extracted via shufflevector, and the
; checked result is (%v1 + %v2) * (%v4 + %v3) on <8 x i8>.
define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX1-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX-LABEL: interleaved_load_vf8_i8_stride4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqu (%rdi), %ymm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
  %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>

  %add1 = add <8 x i8> %v1, %v2
  %add2 = add <8 x i8> %v4, %v3
  %add3 = mul <8 x i8> %add1, %add2
  ret <8 x i8> %add3
}

; Stride-4 deinterleave of a 64-byte load into four <16 x i8> lanes; the lanes
; are compared pairwise with icmp eq and the two resulting masks compared
; again, returning <16 x i1>.
define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
;
AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 509 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3 510 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 511 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 512 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 513 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6 514 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5 515 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 516 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] 517 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> 518 ; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm6 519 ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm5 520 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 521 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> 522 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7 523 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6 524 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 525 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] 526 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 527 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u> 528 ; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm6 529 ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm5 530 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 531 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> 532 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7 533 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6 534 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 535 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] 536 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u> 537 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 538 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 539 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 540 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> 541 
; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm4 542 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 543 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 544 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 545 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0 546 ; AVX1-NEXT: vpxor %xmm0, %xmm3, %xmm0 547 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 548 ; AVX1-NEXT: vzeroupper 549 ; AVX1-NEXT: retq 550 ; 551 ; AVX2-LABEL: interleaved_load_vf16_i8_stride4: 552 ; AVX2: # %bb.0: 553 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 554 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 555 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> 556 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 557 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm4 558 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2 559 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 560 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 561 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 562 ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm6 563 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm4 564 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] 565 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] 566 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> 567 ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm6 568 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 569 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] 570 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> 571 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7 572 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 573 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 574 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] 575 ; AVX2-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2 576 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u> 577 ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm6 578 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 579 ; AVX2-NEXT: 
vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] 580 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> 581 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7 582 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 583 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 584 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] 585 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u> 586 ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 587 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 588 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 589 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> 590 ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 591 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 592 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 593 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 594 ; AVX2-NEXT: vpcmpeqb %xmm0, %xmm4, %xmm0 595 ; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 596 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 597 ; AVX2-NEXT: vzeroupper 598 ; AVX2-NEXT: retq 599 ; 600 ; AVX512-LABEL: interleaved_load_vf16_i8_stride4: 601 ; AVX512: # %bb.0: 602 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 603 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 604 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 605 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> 606 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm4 607 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm3 608 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 609 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 610 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 611 ; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6 612 ; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm5 613 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 614 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm3[2,3] 615 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = 
<u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> 616 ; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm6 617 ; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm5 618 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 619 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> 620 ; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm7 621 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm6 622 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 623 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] 624 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u> 625 ; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm7 626 ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6 627 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 628 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> 629 ; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm3 630 ; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm7 631 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] 632 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] 633 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u> 634 ; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2 635 ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 636 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 637 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> 638 ; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm4 639 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 640 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 641 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 642 ; AVX512-NEXT: vpcmpeqb %zmm5, %zmm8, %k0 643 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm3, %k1 644 ; AVX512-NEXT: kxnorw %k1, %k0, %k0 645 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 646 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 647 ; AVX512-NEXT: vzeroupper 648 ; AVX512-NEXT: retq 649 %wide.vec = load <64 x i8>, <64 x i8>* %ptr 650 %v1 = 
shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> 651 %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61> 652 %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62> 653 %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63> 654 655 %cmp1 = icmp eq <16 x i8> %v1, %v2 656 %cmp2 = icmp eq <16 x i8> %v3, %v4 657 %res = icmp eq <16 x i1> %cmp1, %cmp2 658 659 ret <16 x i1> %res 660 } 661 662 define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) { 663 ; AVX1-LABEL: interleaved_load_vf32_i8_stride4: 664 ; AVX1: # %bb.0: 665 ; AVX1-NEXT: vmovdqa (%rdi), %ymm11 666 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm14 667 ; AVX1-NEXT: vmovdqa 64(%rdi), %ymm2 668 ; AVX1-NEXT: vmovdqa 96(%rdi), %ymm3 669 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> 670 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm12 671 ; AVX1-NEXT: vpshufb %xmm6, %xmm12, %xmm5 672 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm7 673 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] 674 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 675 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm13 676 ; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm4 677 ; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm5 678 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 679 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7] 680 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm8 681 ; AVX1-NEXT: vextractf128 $1, 
%ymm14, %xmm15 682 ; AVX1-NEXT: vpshufb %xmm6, %xmm15, %xmm5 683 ; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm6 684 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 685 ; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm6 686 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm4 687 ; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm0 688 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 689 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7] 690 ; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] 691 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> 692 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm4 693 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm5 694 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 695 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> 696 ; AVX1-NEXT: vpshufb %xmm5, %xmm13, %xmm1 697 ; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm7 698 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] 699 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] 700 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 701 ; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm4 702 ; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0 703 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 704 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm4 705 ; AVX1-NEXT: vpshufb %xmm5, %xmm11, %xmm5 706 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 707 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] 708 ; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] 709 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u> 710 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm1 711 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm4 712 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 713 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> 714 ; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm5 715 ; 
AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm7 716 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] 717 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] 718 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 719 ; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm5 720 ; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0 721 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 722 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm5 723 ; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm4 724 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 725 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] 726 ; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] 727 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u> 728 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm1 729 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm3 730 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 731 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> 732 ; AVX1-NEXT: vpshufb %xmm3, %xmm13, %xmm4 733 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 734 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 735 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 736 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 737 ; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm2 738 ; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0 739 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 740 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm2 741 ; AVX1-NEXT: vpshufb %xmm3, %xmm11, %xmm3 742 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 743 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 744 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 745 ; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1 746 ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2 747 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 748 ; AVX1-NEXT: vpcmpeqb %xmm9, %xmm8, %xmm2 749 ; 
AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 750 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 751 ; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm3 752 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2 753 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm10, %xmm0 754 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 755 ; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 756 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 757 ; AVX1-NEXT: retq 758 ; 759 ; AVX2-LABEL: interleaved_load_vf32_i8_stride4: 760 ; AVX2: # %bb.0: 761 ; AVX2-NEXT: vmovdqa (%rdi), %ymm11 762 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 763 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm7 764 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 765 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9 766 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> 767 ; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm3 768 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm4 769 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 770 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm10 771 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 772 ; AVX2-NEXT: vpshufb %xmm2, %xmm10, %xmm3 773 ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm0 774 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 775 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],xmm4[2,3] 776 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm12 777 ; AVX2-NEXT: vpshufb %xmm6, %xmm12, %xmm3 778 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] 779 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm13 780 ; AVX2-NEXT: vpshufb %xmm6, %xmm13, %xmm6 781 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] 782 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 783 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm6 784 ; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm0 785 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1] 786 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7 787 ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 788 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 789 ; AVX2-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 790 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] 791 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] 792 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> 793 ; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm3 794 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm0 795 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 796 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> 797 ; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4 798 ; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5 799 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 800 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] 801 ; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm4 802 ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm2 803 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 804 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 805 ; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm4 806 ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 807 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 808 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 809 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] 810 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 811 ; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm8 812 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u> 813 ; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm2 814 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 815 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 816 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> 817 ; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4 818 ; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5 819 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 820 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] 821 ; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm4 822 ; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm0 823 ; 
AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 824 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 825 ; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm4 826 ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 827 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 828 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 829 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] 830 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 831 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u> 832 ; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm3 833 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 834 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 835 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> 836 ; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4 837 ; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5 838 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 839 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] 840 ; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm4 841 ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm2 842 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 843 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 844 ; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm4 845 ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 846 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 847 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 848 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] 849 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 850 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 851 ; AVX2-NEXT: vpxor %ymm0, %ymm8, %ymm0 852 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 853 ; AVX2-NEXT: retq 854 ; 855 ; AVX512-LABEL: interleaved_load_vf32_i8_stride4: 856 ; AVX512: # %bb.0: 857 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 858 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm7 859 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 860 ; 
AVX512-NEXT: vextracti128 $1, %ymm1, %xmm10 861 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> 862 ; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm3 863 ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm4 864 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 865 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11 866 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> 867 ; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm5 868 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3 869 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] 870 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0,1],xmm4[2,3] 871 ; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm5 872 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm12 873 ; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm3 874 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] 875 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm13 876 ; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm6 877 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] 878 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 879 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14 880 ; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm4 881 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1] 882 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7 883 ; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2 884 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 885 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 886 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 887 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] 888 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u> 889 ; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm3 890 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm4 891 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 892 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> 893 ; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5 
894 ; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm6 895 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 896 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] 897 ; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm5 898 ; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2 899 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 900 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 901 ; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm5 902 ; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4 903 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 904 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 905 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] 906 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm2[4,5,6,7] 907 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u> 908 ; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm3 909 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm4 910 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 911 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> 912 ; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5 913 ; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm6 914 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 915 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] 916 ; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm5 917 ; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2 918 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 919 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 920 ; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm5 921 ; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4 922 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 923 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 924 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] 925 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 926 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 
= <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u> 927 ; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm4 928 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 929 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 930 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> 931 ; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5 932 ; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 933 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 934 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 935 ; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm1 936 ; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3 937 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 938 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 939 ; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm3 940 ; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4 941 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 942 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 943 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] 944 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 945 ; AVX512-NEXT: vpcmpeqb %zmm9, %zmm8, %k0 946 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm2, %k1 947 ; AVX512-NEXT: kxnord %k1, %k0, %k0 948 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 949 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 950 ; AVX512-NEXT: retq 951 %wide.vec = load <128 x i8>, <128 x i8>* %ptr 952 %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124> 953 954 %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 
73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>

  %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>

  %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>

  %cmp1 = icmp eq <32 x i8> %v1, %v2
  %cmp2 = icmp eq <32 x i8> %v3, %v4
  %res = icmp eq <32 x i1> %cmp1, %cmp2

  ret <32 x i1> %res
}

; Interleaving store: the four <8 x i8> operands are concatenated pairwise,
; interleaved into stride-4 (AoS) order by the final shufflevector, and the
; resulting 32 bytes are stored to %p.
define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
; AVX1-LABEL: interleaved_store_vf8_i8_stride4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX-LABEL: interleaved_store_vf8_i8_stride4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX-NEXT:    vmovdqa %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
  store <32 x i8> %interleaved.vec, <32 x i8>* %p
  ret void
}

; Stride-3 deinterleave of a 96-byte load into three <32 x i8> lanes; the
; checked result is the elementwise sum %v3 + (%v1 + %v2).
define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
; AVX1-LABEL: 
interleaved_load_vf32_i8_stride3: 1008 ; AVX1: # %bb.0: 1009 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 1010 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 1011 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 1012 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 1013 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4 1014 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5 1015 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] 1016 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 1017 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 1018 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 1019 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 1020 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 1021 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 1022 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] 1023 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 1024 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] 1025 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] 1026 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8 1027 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] 1028 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] 1029 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2 1030 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] 1031 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] 1032 ; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1033 ; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2 1034 ; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5 1035 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2 1036 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128] 1037 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 1038 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] 1039 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm6 1040 ; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3 1041 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 1042 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 1043 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 1044 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 1045 ; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 1046 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 1047 ; AVX1-NEXT: vpaddb %xmm9, %xmm2, %xmm2 1048 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1049 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1050 ; AVX1-NEXT: retq 1051 ; 1052 ; AVX-LABEL: interleaved_load_vf32_i8_stride3: 1053 ; AVX: # %bb.0: 1054 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 1055 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 1056 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 1057 ; AVX-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 1058 ; AVX-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 1059 ; AVX-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 1060 ; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] 1061 ; AVX-NEXT: vpshufb %ymm3, %ymm0, %ymm0 1062 ; AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 1063 ; AVX-NEXT: vpshufb %ymm3, %ymm2, %ymm2 1064 ; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] 1065 ; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] 1066 ; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] 1067 ; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] 1068 ; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1069 ; AVX-NEXT: vpblendvb %ymm4, %ymm0, 
%ymm1, %ymm1 1070 ; AVX-NEXT: vpaddb %ymm2, %ymm1, %ymm1 1071 ; AVX-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 1072 ; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] 1073 ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1074 ; AVX-NEXT: retq 1075 %wide.vec = load <96 x i8>, <96 x i8>* %ptr 1076 %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93> 1077 %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94> 1078 %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95> 1079 %add1 = add <32 x i8> %v1, %v2 1080 %add2 = add <32 x i8> %v3, %add1 1081 ret <32 x i8> %add2 1082 } 1083 1084 define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){ 1085 ; AVX1-LABEL: interleaved_load_vf16_i8_stride3: 1086 ; AVX1: # %bb.0: 1087 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 1088 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 1089 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 1090 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] 1091 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1092 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1093 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1094 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 1095 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] 1096 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] 1097 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] 1098 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1099 ; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 1100 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 1101 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero 1102 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] 1103 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1104 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 1105 ; AVX1-NEXT: retq 1106 ; 1107 ; AVX-LABEL: interleaved_load_vf16_i8_stride3: 1108 ; AVX: # %bb.0: 1109 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 1110 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 1111 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 1112 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] 1113 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1114 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1115 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1116 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 1117 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] 1118 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] 1119 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] 1120 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1121 ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 1122 ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 1123 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero 1124 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] 1125 ; AVX-NEXT: vpor %xmm0, %xmm2, 
%xmm0 1126 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 1127 ; AVX-NEXT: retq 1128 %wide.vec = load <48 x i8>, <48 x i8>* %ptr 1129 %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42 ,i32 45> 1130 %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46> 1131 %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47> 1132 %add1 = add <16 x i8> %v1, %v2 1133 %add2 = add <16 x i8> %v3, %add1 1134 ret <16 x i8> %add2 1135 } 1136 1137 define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){ 1138 ; AVX1-LABEL: interleaved_load_vf8_i8_stride3: 1139 ; AVX1: # %bb.0: 1140 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 1141 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1142 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] 1143 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] 1144 ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 1145 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] 1146 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] 1147 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 1148 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] 1149 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] 1150 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1151 ; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0 1152 ; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 1153 ; AVX1-NEXT: vzeroupper 1154 ; AVX1-NEXT: retq 1155 ; 1156 ; AVX-LABEL: 
interleaved_load_vf8_i8_stride3: 1157 ; AVX: # %bb.0: 1158 ; AVX-NEXT: vmovdqa (%rdi), %ymm0 1159 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 1160 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] 1161 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] 1162 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 1163 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] 1164 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] 1165 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 1166 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] 1167 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] 1168 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1169 ; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0 1170 ; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0 1171 ; AVX-NEXT: vzeroupper 1172 ; AVX-NEXT: retq 1173 %wide.vec = load <24 x i8>, <24 x i8>* %ptr 1174 %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21> 1175 %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22> 1176 %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23> 1177 %add1 = add <8 x i8> %v1, %v2 1178 %add2 = add <8 x i8> %v3, %add1 1179 ret <8 x i8> %add2 1180 } 1181 1182 define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) { 1183 ; AVX1-LABEL: interleaved_store_vf8_i8_stride3: 1184 ; AVX1: # %bb.0: 1185 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1186 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1187 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1188 ; AVX1-NEXT: 
vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1189 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1 1190 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] 1191 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero 1192 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 1193 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] 1194 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] 1195 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1196 ; AVX1-NEXT: vmovq %xmm0, 16(%rdi) 1197 ; AVX1-NEXT: vmovdqu %xmm2, (%rdi) 1198 ; AVX1-NEXT: retq 1199 ; 1200 ; AVX-LABEL: interleaved_store_vf8_i8_stride3: 1201 ; AVX: # %bb.0: 1202 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> 1203 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1204 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1205 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1206 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1 1207 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] 1208 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero 1209 ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 1210 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] 1211 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] 1212 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1213 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) 1214 ; AVX-NEXT: vmovdqu %xmm2, (%rdi) 1215 ; AVX-NEXT: retq 1216 %1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1217 %2 = shufflevector <8 x i8> %c, <8 x 
i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1218 %interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> 1219 store <24 x i8> %interleaved.vec, <24 x i8>* %p, align 1 1220 ret void 1221 } 1222 1223 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) { 1224 ; AVX1-LABEL: interleaved_store_vf16_i8_stride3: 1225 ; AVX1: # %bb.0: 1226 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1227 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1228 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 1229 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 1230 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 1231 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128] 1232 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm4 1233 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] 1234 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm6 1235 ; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4 1236 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm6 1237 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 1238 ; AVX1-NEXT: vpor %xmm6, %xmm0, %xmm0 1239 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1240 ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 1241 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 1242 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 1243 ; AVX1-NEXT: vmovdqu %xmm1, 32(%rdi) 1244 ; AVX1-NEXT: vmovups %ymm0, (%rdi) 1245 ; AVX1-NEXT: vzeroupper 1246 ; AVX1-NEXT: retq 1247 ; 1248 ; AVX2-LABEL: interleaved_store_vf16_i8_stride3: 
1249 ; AVX2: # %bb.0: 1250 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1251 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1252 ; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 1253 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 1254 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 1255 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128] 1256 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm4 1257 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] 1258 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 1259 ; AVX2-NEXT: vpor %xmm4, %xmm6, %xmm4 1260 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm6 1261 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 1262 ; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0 1263 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1264 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 1265 ; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 1266 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 1267 ; AVX2-NEXT: vmovdqu %xmm1, 32(%rdi) 1268 ; AVX2-NEXT: vmovdqu %ymm0, (%rdi) 1269 ; AVX2-NEXT: vzeroupper 1270 ; AVX2-NEXT: retq 1271 ; 1272 ; AVX512-LABEL: interleaved_store_vf16_i8_stride3: 1273 ; AVX512: # %bb.0: 1274 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1275 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1276 ; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 1277 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 1278 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 1279 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128] 1280 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm4 1281 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = 
[5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] 1282 ; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6 1283 ; AVX512-NEXT: vpor %xmm4, %xmm6, %xmm4 1284 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm6 1285 ; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm0 1286 ; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 1287 ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1288 ; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm1 1289 ; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 1290 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 1291 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 1292 ; AVX512-NEXT: vmovdqu %ymm0, (%rdi) 1293 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rdi) 1294 ; AVX512-NEXT: vzeroupper 1295 ; AVX512-NEXT: retq 1296 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1297 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1298 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47> 1299 store <48 x i8> %interleaved.vec, <48 x i8>* %p, align 1 1300 ret void 1301 } 1302 1303 define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <96 
x i8>* %p) { 1304 ; AVX1-LABEL: interleaved_store_vf32_i8_stride3: 1305 ; AVX1: # %bb.0: 1306 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1307 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1308 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1309 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1310 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1311 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1312 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 1313 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 1314 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] 1315 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 1316 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 1317 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 1318 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 1319 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 1320 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 1321 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 1322 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 1323 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] 1324 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] 1325 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1326 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 1327 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm6 1328 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 1329 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 1330 ; AVX1-NEXT: vpshufb 
%xmm5, %xmm1, %xmm1 1331 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 1332 ; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 1333 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 1334 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1335 ; AVX1-NEXT: vmovups %ymm2, 64(%rdi) 1336 ; AVX1-NEXT: vmovups %ymm1, 32(%rdi) 1337 ; AVX1-NEXT: vmovups %ymm0, (%rdi) 1338 ; AVX1-NEXT: vzeroupper 1339 ; AVX1-NEXT: retq 1340 ; 1341 ; AVX2-LABEL: interleaved_store_vf32_i8_stride3: 1342 ; AVX2: # %bb.0: 1343 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1344 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1345 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1346 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1347 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1348 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1349 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1350 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1351 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3 1352 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1353 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1354 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] 1355 ; AVX2-NEXT: vpshufb %ymm4, %ymm2, 
%ymm2 1356 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 1357 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1358 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) 1359 ; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi) 1360 ; AVX2-NEXT: vmovdqu %ymm3, (%rdi) 1361 ; AVX2-NEXT: vzeroupper 1362 ; AVX2-NEXT: retq 1363 ; 1364 ; AVX512-LABEL: interleaved_store_vf32_i8_stride3: 1365 ; AVX512: # %bb.0: 1366 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1367 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1368 ; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1369 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1370 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1371 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1372 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1373 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1374 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3 1375 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1376 ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1377 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] 1378 ; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 1379 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 
1380 ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 1381 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm1 1382 ; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi) 1383 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) 1384 ; AVX512-NEXT: vzeroupper 1385 ; AVX512-NEXT: retq 1386 %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 1387 %2 = shufflevector <32 x i8> %c, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1388 %interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 
82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95> 1389 store <96 x i8> %interleaved.vec, <96 x i8>* %p, align 1 1390 ret void 1391 } 1392 1393 define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <192 x i8>* %p) { 1394 ; AVX1-LABEL: interleaved_store_vf64_i8_stride3: 1395 ; AVX1: # %bb.0: 1396 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 1397 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1398 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1399 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 1400 ; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1401 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] 1402 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 1403 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm6[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1404 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1405 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 1406 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1407 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] 1408 ; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 1409 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6 1410 ; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] 1411 ; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] 1412 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 1413 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] 1414 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 1415 ; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] 1416 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] 1417 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] 1418 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 1419 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 1420 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] 1421 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] 1422 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] 1423 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] 1424 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] 1425 ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] 1426 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] 1427 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] 1428 ; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] 1429 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] 1430 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 1431 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] 1432 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] 1433 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] 1434 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1435 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 1436 ; AVX1-NEXT: vpshufb %xmm5, 
%xmm7, %xmm7 1437 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 1438 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm6 1439 ; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 1440 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 1441 ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 1442 ; AVX1-NEXT: vpshufb %xmm5, %xmm14, %xmm6 1443 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 1444 ; AVX1-NEXT: vpshufb %xmm5, %xmm9, %xmm6 1445 ; AVX1-NEXT: vpshufb %xmm5, %xmm15, %xmm7 1446 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 1447 ; AVX1-NEXT: vpshufb %xmm5, %xmm11, %xmm7 1448 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 1449 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 1450 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 1451 ; AVX1-NEXT: vpshufb %xmm5, %xmm8, %xmm5 1452 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 1453 ; AVX1-NEXT: vmovups %ymm3, 160(%rdi) 1454 ; AVX1-NEXT: vmovups %ymm4, 128(%rdi) 1455 ; AVX1-NEXT: vmovups %ymm6, 96(%rdi) 1456 ; AVX1-NEXT: vmovups %ymm1, 64(%rdi) 1457 ; AVX1-NEXT: vmovups %ymm2, 32(%rdi) 1458 ; AVX1-NEXT: vmovups %ymm0, (%rdi) 1459 ; AVX1-NEXT: vzeroupper 1460 ; AVX1-NEXT: retq 1461 ; 1462 ; AVX2-LABEL: interleaved_store_vf64_i8_stride3: 1463 ; AVX2: # %bb.0: 1464 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1465 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] 1466 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1467 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] 1468 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] 1469 ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = 
ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] 1470 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1471 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1472 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1473 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1474 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] 1475 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] 1476 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] 1477 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] 1478 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20] 1479 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] 1480 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm6 1481 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1482 ; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6 1483 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] 1484 ; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5 1485 ; AVX2-NEXT: 
vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 1486 ; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 1487 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm2 1488 ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 1489 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] 1490 ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 1491 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 1492 ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1 1493 ; AVX2-NEXT: vmovdqu %ymm1, 160(%rdi) 1494 ; AVX2-NEXT: vmovdqu %ymm4, 128(%rdi) 1495 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) 1496 ; AVX2-NEXT: vmovdqu %ymm5, 32(%rdi) 1497 ; AVX2-NEXT: vmovdqu %ymm2, 96(%rdi) 1498 ; AVX2-NEXT: vmovdqu %ymm6, (%rdi) 1499 ; AVX2-NEXT: vzeroupper 1500 ; AVX2-NEXT: retq 1501 ; 1502 ; AVX512-LABEL: interleaved_store_vf64_i8_stride3: 1503 ; AVX512: # %bb.0: 1504 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53] 1505 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58] 1506 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52] 1507 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52] 1508 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = 
zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52] 1509 ; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52] 1510 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52] 1511 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52] 1512 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3 1513 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 1514 ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 1515 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5,6,7] 1516 ; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm5 1517 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] 1518 ; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm6 1519 ; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm2 1520 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1521 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm7 1522 ; AVX512-NEXT: vpshufb %ymm4, %ymm7, %ymm7 1523 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1524 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] 1525 ; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 1526 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 1527 ; AVX512-NEXT: vpshufb %ymm4, 
%ymm0, %ymm0 1528 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm1 1529 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3 1530 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 1531 ; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rdi) 1532 ; AVX512-NEXT: vmovdqu64 %zmm3, 64(%rdi) 1533 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) 1534 ; AVX512-NEXT: vzeroupper 1535 ; AVX512-NEXT: retq 1536 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 1537 %2 = shufflevector <64 x i8> %c, <64 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, 
i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1538 %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 
174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191> 1539 store <192 x i8> %3, <192 x i8>* %p, align 1 1540 ret void 1541 } 1542 1543 define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){ 1544 ; AVX1-LABEL: interleaved_load_vf64_i8_stride3: 1545 ; AVX1: # %bb.0: 1546 ; AVX1-NEXT: vmovdqu (%rdi), %xmm11 1547 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm10 1548 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm8 1549 ; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 1550 ; AVX1-NEXT: vmovdqu 64(%rdi), %xmm12 1551 ; AVX1-NEXT: vmovdqu 80(%rdi), %xmm9 1552 ; AVX1-NEXT: vmovdqu 96(%rdi), %xmm6 1553 ; AVX1-NEXT: vmovdqu 112(%rdi), %xmm14 1554 ; AVX1-NEXT: vmovdqu 128(%rdi), %xmm13 1555 ; AVX1-NEXT: vmovdqu 144(%rdi), %xmm5 1556 ; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1 1557 ; AVX1-NEXT: vmovdqu 176(%rdi), %xmm15 1558 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] 1559 ; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6 1560 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 1561 ; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm2 1562 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 1563 ; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm11 1564 ; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm12 1565 ; AVX1-NEXT: vpshufb %xmm4, %xmm14, %xmm14 1566 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 1567 ; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm0 1568 ; AVX1-NEXT: vpshufb %xmm4, %xmm15, %xmm7 1569 ; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm13 1570 ; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm4 1571 ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] 1572 ; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = 
xmm13[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] 1573 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] 1574 ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] 1575 ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] 1576 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10] 1577 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10] 1578 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 1579 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7,8,9,10] 1580 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] 1581 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 1582 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm14 1583 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] 1584 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10] 1585 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12 1586 ; AVX1-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1587 ; AVX1-NEXT: vandnps %ymm12, %ymm13, %ymm12 1588 ; AVX1-NEXT: vandps %ymm13, %ymm14, %ymm14 1589 ; AVX1-NEXT: vorps %ymm12, %ymm14, %ymm12 1590 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 1591 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] 1592 ; AVX1-NEXT: vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill 1593 ; AVX1-NEXT: vandnps %ymm14, %ymm13, %ymm14 1594 ; AVX1-NEXT: vandps %ymm13, %ymm7, %ymm7 1595 ; AVX1-NEXT: vorps %ymm14, %ymm7, %ymm13 1596 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128] 1597 ; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm3 1598 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 
= [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] 1599 ; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm4 1600 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 1601 ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm10[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] 1602 ; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm2 1603 ; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm4 1604 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1605 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] 1606 ; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm4 1607 ; AVX1-NEXT: vpshufb %xmm7, %xmm9, %xmm5 1608 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 1609 ; AVX1-NEXT: vpshufb %xmm14, %xmm6, %xmm5 1610 ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm8[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 1611 ; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm0 1612 ; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm5 1613 ; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm0 1614 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 1615 ; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm1 1616 ; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 1617 ; AVX1-NEXT: vpaddb -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 1618 ; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0 1619 ; AVX1-NEXT: vpaddb %xmm11, %xmm12, %xmm3 1620 ; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 1621 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1622 ; AVX1-NEXT: vpaddb %xmm6, %xmm13, %xmm2 1623 ; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 1624 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1625 ; AVX1-NEXT: retq 1626 ; 1627 ; AVX2-LABEL: interleaved_load_vf64_i8_stride3: 1628 ; AVX2: # %bb.0: 1629 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0 1630 ; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1 1631 ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm2 1632 ; AVX2-NEXT: vmovdqu 96(%rdi), %xmm3 1633 ; AVX2-NEXT: vmovdqu 112(%rdi), %xmm4 1634 ; AVX2-NEXT: vmovdqu 128(%rdi), %xmm5 1635 ; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 1636 ; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 1637 ; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 1638 ; AVX2-NEXT: vinserti128 $1, 
144(%rdi), %ymm3, %ymm3 1639 ; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 1640 ; AVX2-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 1641 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] 1642 ; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3 1643 ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 1644 ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1 1645 ; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4 1646 ; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm5 1647 ; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2 1648 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] 1649 ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] 1650 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26] 1651 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] 1652 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] 1653 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26] 1654 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] 1655 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] 1656 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1657 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1] 1658 ; AVX2-NEXT: vpblendvb %ymm8, 
%ymm0, %ymm1, %ymm1 1659 ; AVX2-NEXT: vpaddb %ymm5, %ymm1, %ymm1 1660 ; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 1661 ; AVX2-NEXT: vpaddb %ymm4, %ymm2, %ymm2 1662 ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0 1663 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] 1664 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1665 ; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm3, %ymm1 1666 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] 1667 ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 1668 ; AVX2-NEXT: retq 1669 ; 1670 ; AVX512-LABEL: interleaved_load_vf64_i8_stride3: 1671 ; AVX512: # %bb.0: 1672 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0 1673 ; AVX512-NEXT: vmovdqu 16(%rdi), %xmm1 1674 ; AVX512-NEXT: vmovdqu 32(%rdi), %xmm2 1675 ; AVX512-NEXT: vmovdqu 96(%rdi), %xmm3 1676 ; AVX512-NEXT: vmovdqu 112(%rdi), %xmm4 1677 ; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5 1678 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 1679 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 1680 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 1681 ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 1682 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 1683 ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 1684 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 1685 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 1686 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 1687 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] 1688 ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0 1689 ; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1 1690 ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2 1691 ; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = 
zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58] 1692 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] 1693 ; AVX512-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800 1694 ; AVX512-NEXT: kmovq %rax, %k1 1695 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1696 ; AVX512-NEXT: # ymm4 = mem[0,1,0,1] 1697 ; AVX512-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm5 1698 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm6 1699 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 {%k1} = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58] 1700 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58] 1701 ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] 1702 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 1703 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] 1704 ; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm2 
1705 ; AVX512-NEXT: vpblendvb %ymm4, %ymm2, %ymm6, %ymm2 1706 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] 1707 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 1708 ; AVX512-NEXT: vpaddb %zmm0, %zmm1, %zmm0 1709 ; AVX512-NEXT: retq 1710 %wide.vec = load <192 x i8>, <192 x i8>* %ptr, align 1 1711 %v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189> 1712 %v2 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190> 1713 %v3 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, 
i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191> 1714 %add1 = add <64 x i8> %v1, %v2 1715 %add2 = add <64 x i8> %v3, %add1 1716 ret <64 x i8> %add2 1717 } 1718 1719 define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c,<64 x i8> %d, <256 x i8>* %p) { 1720 ; AVX1-LABEL: interleaved_store_vf64_i8_stride4: 1721 ; AVX1: # %bb.0: 1722 ; AVX1-NEXT: subq $24, %rsp 1723 ; AVX1-NEXT: .cfi_def_cfa_offset 32 1724 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1725 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11 1726 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm12 1727 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 1728 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 1729 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13 1730 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14 1731 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 1732 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1733 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 1734 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] 1735 
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 1736 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 1737 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 1738 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] 1739 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 1740 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] 1741 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1 1742 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 1743 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1744 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] 1745 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill 1746 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] 1747 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 1748 ; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill 1749 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3 1750 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] 1751 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4 1752 ; 
AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 1753 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] 1754 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] 1755 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] 1756 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm1 1757 ; AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) # 32-byte Spill 1758 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] 1759 ; AVX1-NEXT: vmovdqa %xmm8, %xmm2 1760 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] 1761 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm13 1762 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] 1763 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] 1764 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] 1765 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1766 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload 1767 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] 1768 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload 1769 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] 1770 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm14 1771 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload 1772 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), 
%xmm1 # 16-byte Reload 1773 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 1774 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload 1775 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 1776 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] 1777 ; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 1778 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 1779 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] 1780 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1781 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 1782 ; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm0 1783 ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 1784 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3] 1785 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 1786 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6 1787 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] 1788 ; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm10, %ymm2 1789 ; AVX1-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm3 # 32-byte Reload 1790 ; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm0 1791 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] 1792 ; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm3 1793 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm14, %ymm7 1794 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] 1795 ; AVX1-NEXT: vmovaps %ymm3, 224(%rdi) 1796 ; AVX1-NEXT: vmovaps %ymm2, 192(%rdi) 1797 ; AVX1-NEXT: vmovaps %ymm7, 160(%rdi) 1798 ; AVX1-NEXT: vmovaps %ymm0, 128(%rdi) 1799 ; AVX1-NEXT: vmovaps %ymm1, 96(%rdi) 1800 ; AVX1-NEXT: vmovaps %ymm5, 64(%rdi) 1801 ; AVX1-NEXT: vmovaps %ymm6, 32(%rdi) 1802 ; AVX1-NEXT: vmovaps %ymm8, (%rdi) 1803 ; AVX1-NEXT: addq 
$24, %rsp 1804 ; AVX1-NEXT: .cfi_def_cfa_offset 8 1805 ; AVX1-NEXT: vzeroupper 1806 ; AVX1-NEXT: retq 1807 ; 1808 ; AVX2-LABEL: interleaved_store_vf64_i8_stride4: 1809 ; AVX2: # %bb.0: 1810 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1811 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] 1812 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1813 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] 1814 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] 1815 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23] 1816 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31] 1817 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] 1818 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] 1819 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] 1820 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15] 1821 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] 1822 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11] 1823 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] 
1824 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] 1825 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] 1826 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm4 1827 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm5 1828 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] 1829 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] 1830 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm7 1831 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 1832 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] 1833 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] 1834 ; AVX2-NEXT: vmovdqa %ymm1, 224(%rdi) 1835 ; AVX2-NEXT: vmovdqa %ymm3, 192(%rdi) 1836 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi) 1837 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi) 1838 ; AVX2-NEXT: vmovdqa %ymm9, 160(%rdi) 1839 ; AVX2-NEXT: vmovdqa %ymm7, 128(%rdi) 1840 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi) 1841 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi) 1842 ; AVX2-NEXT: vzeroupper 1843 ; AVX2-NEXT: retq 1844 ; 1845 ; AVX512-LABEL: interleaved_store_vf64_i8_stride4: 1846 ; AVX512: # %bb.0: 1847 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] 1848 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm0 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] 1849 ; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55] 1850 ; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63] 1851 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm3 = 
zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27] 1852 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31] 1853 ; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27] 1854 ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31] 1855 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 1856 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 1857 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm1[2,3] 1858 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm0[2,3] 1859 ; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm3 1860 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 1861 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm8 1862 ; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm4 1863 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1864 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm9 1865 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] 1866 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] 1867 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, 
%zmm2 1868 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3 1869 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm4 1870 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 1871 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdi) 1872 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdi) 1873 ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdi) 1874 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi) 1875 ; AVX512-NEXT: vzeroupper 1876 ; AVX512-NEXT: retq 1877 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 1878 %2 = shufflevector <64 x i8> %c, <64 x i8> %d, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 
52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> 1879 %interleaved = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 
38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255> 1880 store <256 x i8> %interleaved, <256 x i8>* %p 1881 ret void 1882 } 1883