      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1
      3 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      4 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
      5 
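; Factor-4 interleaved load: one <16 x double> load is split into four stride-4
; vectors (elements 0,4,8,12 / 1,5,9,13 / 2,6,10,14 / 3,7,11,15) which are then
; summed. The checks below expect four ymm loads combined with vperm2f128,
; vhaddpd and vunpck[lh]pd shuffles rather than scalar extracts.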
      6 define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
      7 ; AVX1-LABEL: load_factorf64_4:
      8 ; AVX1:       # %bb.0:
      9 ; AVX1-NEXT:    vmovupd (%rdi), %ymm0
     10 ; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
     11 ; AVX1-NEXT:    vmovupd 64(%rdi), %ymm2
     12 ; AVX1-NEXT:    vmovupd 96(%rdi), %ymm3
     13 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
     14 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
     15 ; AVX1-NEXT:    vhaddpd %ymm5, %ymm4, %ymm4
     16 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
     17 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
     18 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
     19 ; AVX1-NEXT:    vaddpd %ymm2, %ymm4, %ymm2
     20 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
     21 ; AVX1-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
     22 ; AVX1-NEXT:    retq
     23 ;
     24 ; AVX-LABEL: load_factorf64_4:
     25 ; AVX:       # %bb.0:
     26 ; AVX-NEXT:    vmovupd (%rdi), %ymm0
     27 ; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
     28 ; AVX-NEXT:    vmovupd 64(%rdi), %ymm2
     29 ; AVX-NEXT:    vmovupd 96(%rdi), %ymm3
     30 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
     31 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
     32 ; AVX-NEXT:    vhaddpd %ymm5, %ymm4, %ymm4
     33 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
     34 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
     35 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
     36 ; AVX-NEXT:    vaddpd %ymm2, %ymm4, %ymm2
     37 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
     38 ; AVX-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
     39 ; AVX-NEXT:    retq
     40   %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
     41   %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
     42   %strided.v1 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
     43   %strided.v2 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
     44   %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
     45   %add1 = fadd <4 x double> %strided.v0, %strided.v1
     46   %add2 = fadd <4 x double> %add1, %strided.v2
     47   %add3 = fadd <4 x double> %add2, %strided.v3
     48   ret <4 x double> %add3
     49 }
     50 
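; Same interleaved layout, but only two of the four strided vectors (lanes 0 and 3
; of each group) are used, so only the shuffles feeding the final vmulpd remain.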
     51 define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
     52 ; AVX1-LABEL: load_factorf64_2:
     53 ; AVX1:       # %bb.0:
     54 ; AVX1-NEXT:    vmovupd (%rdi), %ymm0
     55 ; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
     56 ; AVX1-NEXT:    vmovupd 64(%rdi), %ymm2
     57 ; AVX1-NEXT:    vmovupd 96(%rdi), %ymm3
     58 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
     59 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
     60 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
     61 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
     62 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
     63 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
     64 ; AVX1-NEXT:    vmulpd %ymm0, %ymm4, %ymm0
     65 ; AVX1-NEXT:    retq
     66 ;
     67 ; AVX-LABEL: load_factorf64_2:
     68 ; AVX:       # %bb.0:
     69 ; AVX-NEXT:    vmovupd (%rdi), %ymm0
     70 ; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
     71 ; AVX-NEXT:    vmovupd 64(%rdi), %ymm2
     72 ; AVX-NEXT:    vmovupd 96(%rdi), %ymm3
     73 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
     74 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
     75 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
     76 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
     77 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
     78 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
     79 ; AVX-NEXT:    vmulpd %ymm0, %ymm4, %ymm0
     80 ; AVX-NEXT:    retq
     81   %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
     82   %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
     83   %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
     84   %mul = fmul <4 x double> %strided.v0, %strided.v3
     85   ret <4 x double> %mul
     86 }
     87 
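; Degenerate case: both shuffles extract the same stride-4 vector (lane 0), so the
; result is that vector multiplied by itself; the upper two loads are folded into
; the vperm2f128 memory operands and only the lane-0 gather survives.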
     88 define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
     89 ; AVX1-LABEL: load_factorf64_1:
     90 ; AVX1:       # %bb.0:
     91 ; AVX1-NEXT:    vmovupd (%rdi), %ymm0
     92 ; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
     93 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
     94 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
     95 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
     96 ; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
     97 ; AVX1-NEXT:    retq
     98 ;
     99 ; AVX-LABEL: load_factorf64_1:
    100 ; AVX:       # %bb.0:
    101 ; AVX-NEXT:    vmovupd (%rdi), %ymm0
    102 ; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
    103 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
    104 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
    105 ; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
    106 ; AVX-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
    107 ; AVX-NEXT:    retq
    108   %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
    109   %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
    110   %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
    111   %mul = fmul <4 x double> %strided.v0, %strided.v3
    112   ret <4 x double> %mul
    113 }
    114 
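; Integer variant of the factor-4 load. AVX2/AVX512 keep the vpaddq sums in ymm
; registers; plain AVX has no 256-bit integer add, so the sums are split into xmm
; vpaddq pairs and reassembled with vinsertf128.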
    115 define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
    116 ; AVX1-LABEL: load_factori64_4:
    117 ; AVX1:       # %bb.0:
    118 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
    119 ; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
    120 ; AVX1-NEXT:    vmovups 64(%rdi), %ymm2
    121 ; AVX1-NEXT:    vmovups 96(%rdi), %ymm3
    122 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
    123 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
    124 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
    125 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
    126 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
    127 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
    128 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
    129 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
    130 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm1
    131 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
    132 ; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm4
    133 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
    134 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
    135 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
    136 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
    137 ; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
    138 ; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
    139 ; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
    140 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    141 ; AVX1-NEXT:    retq
    142 ;
    143 ; AVX-LABEL: load_factori64_4:
    144 ; AVX:       # %bb.0:
    145 ; AVX-NEXT:    vmovdqu (%rdi), %ymm0
    146 ; AVX-NEXT:    vmovdqu 32(%rdi), %ymm1
    147 ; AVX-NEXT:    vmovdqu 64(%rdi), %ymm2
    148 ; AVX-NEXT:    vmovdqu 96(%rdi), %ymm3
    149 ; AVX-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
    150 ; AVX-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
    151 ; AVX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
    152 ; AVX-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
    153 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
    154 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
    155 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
    156 ; AVX-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
    157 ; AVX-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
    158 ; AVX-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
    159 ; AVX-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
    160 ; AVX-NEXT:    retq
    161   %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
    162   %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
    163   %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
    164   %strided.v2 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
    165   %strided.v3 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
    166   %add1 = add <4 x i64> %strided.v0, %strided.v1
    167   %add2 = add <4 x i64> %add1, %strided.v2
    168   %add3 = add <4 x i64> %add2, %strided.v3
    169   ret <4 x i64> %add3
    170 }
    171 
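; Factor-4 interleaved store: four <4 x double> vectors are interleaved
; element-wise into one <16 x double>. The lowering is a 4x4 transpose built from
; vinsertf128/vperm2f128 and vunpck[lh]pd, followed by four ymm stores (two zmm
; stores on AVX512).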
    172 define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
    173 ; AVX1-LABEL: store_factorf64_4:
    174 ; AVX1:       # %bb.0:
    175 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
    176 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
    177 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
    178 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
    179 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
    180 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
    181 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
    182 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
    183 ; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
    184 ; AVX1-NEXT:    vmovups %ymm3, 64(%rdi)
    185 ; AVX1-NEXT:    vmovups %ymm4, 32(%rdi)
    186 ; AVX1-NEXT:    vmovups %ymm2, (%rdi)
    187 ; AVX1-NEXT:    vzeroupper
    188 ; AVX1-NEXT:    retq
    189 ;
    190 ; AVX2-LABEL: store_factorf64_4:
    191 ; AVX2:       # %bb.0:
    192 ; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
    193 ; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
    194 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
    195 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
    196 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
    197 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
    198 ; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
    199 ; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
    200 ; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
    201 ; AVX2-NEXT:    vmovups %ymm3, 64(%rdi)
    202 ; AVX2-NEXT:    vmovups %ymm4, 32(%rdi)
    203 ; AVX2-NEXT:    vmovups %ymm2, (%rdi)
    204 ; AVX2-NEXT:    vzeroupper
    205 ; AVX2-NEXT:    retq
    206 ;
    207 ; AVX512-LABEL: store_factorf64_4:
    208 ; AVX512:       # %bb.0:
    209 ; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
    210 ; AVX512-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
    211 ; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
    212 ; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
    213 ; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
    214 ; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
    215 ; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
    216 ; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
    217 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
    218 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
    219 ; AVX512-NEXT:    vmovups %zmm0, 64(%rdi)
    220 ; AVX512-NEXT:    vmovups %zmm1, (%rdi)
    221 ; AVX512-NEXT:    vzeroupper
    222 ; AVX512-NEXT:    retq
    223   %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    224   %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    225   %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
    226   store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
    227   ret void
    228 }
    229 
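; Same factor-4 interleaved store with <4 x i64> inputs; the lowering matches the
; f64 case since only 64-bit lane shuffles and full-width stores are needed.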
    230 define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) {
    231 ; AVX1-LABEL: store_factori64_4:
    232 ; AVX1:       # %bb.0:
    233 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
    234 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
    235 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
    236 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
    237 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
    238 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
    239 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
    240 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
    241 ; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
    242 ; AVX1-NEXT:    vmovups %ymm3, 64(%rdi)
    243 ; AVX1-NEXT:    vmovups %ymm4, 32(%rdi)
    244 ; AVX1-NEXT:    vmovups %ymm2, (%rdi)
    245 ; AVX1-NEXT:    vzeroupper
    246 ; AVX1-NEXT:    retq
    247 ;
    248 ; AVX2-LABEL: store_factori64_4:
    249 ; AVX2:       # %bb.0:
    250 ; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
    251 ; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
    252 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
    253 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
    254 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
    255 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
    256 ; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
    257 ; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
    258 ; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
    259 ; AVX2-NEXT:    vmovups %ymm3, 64(%rdi)
    260 ; AVX2-NEXT:    vmovups %ymm4, 32(%rdi)
    261 ; AVX2-NEXT:    vmovups %ymm2, (%rdi)
    262 ; AVX2-NEXT:    vzeroupper
    263 ; AVX2-NEXT:    retq
    264 ;
    265 ; AVX512-LABEL: store_factori64_4:
    266 ; AVX512:       # %bb.0:
    267 ; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
    268 ; AVX512-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
    269 ; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
    270 ; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
    271 ; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
    272 ; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
    273 ; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
    274 ; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
    275 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
    276 ; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
    277 ; AVX512-NEXT:    vmovups %zmm0, 64(%rdi)
    278 ; AVX512-NEXT:    vmovups %zmm1, (%rdi)
    279 ; AVX512-NEXT:    vzeroupper
    280 ; AVX512-NEXT:    retq
    281   %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    282   %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    283   %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
    284   store <16 x i64> %interleaved.vec, <16 x i64>* %ptr, align 16
    285   ret void
    286 }
    287 
    288 
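; vf32 stride-4 byte interleave: four <32 x i8> vectors are interleaved into a
; <128 x i8> store. The byte transpose is built from vpunpck[lh]bw followed by
; vpunpck[lh]wd, with vinsert/vperm2 lane shuffles to put the 128-bit halves into
; store order.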
    289 define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, <128 x i8>* %p) {
    290 ; AVX1-LABEL: interleaved_store_vf32_i8_stride4:
    291 ; AVX1:       # %bb.0:
    292 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    293 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
    294 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
    295 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
    296 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
    297 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
    298 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
    299 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
    300 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
    301 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
    302 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
    303 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
    304 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
    305 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
    306 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
    307 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
    308 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
    309 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
    310 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
    311 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
    312 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
    313 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
    314 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
    315 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
    316 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm4
    317 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
    318 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
    319 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
    320 ; AVX1-NEXT:    vmovaps %ymm0, 96(%rdi)
    321 ; AVX1-NEXT:    vmovaps %ymm1, 64(%rdi)
    322 ; AVX1-NEXT:    vmovaps %ymm2, 32(%rdi)
    323 ; AVX1-NEXT:    vmovaps %ymm4, (%rdi)
    324 ; AVX1-NEXT:    vzeroupper
    325 ; AVX1-NEXT:    retq
    326 ;
    327 ; AVX2-LABEL: interleaved_store_vf32_i8_stride4:
    328 ; AVX2:       # %bb.0:
    329 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
    330 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
    331 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
    332 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
    333 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
    334 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
    335 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
    336 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
    337 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm2
    338 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm5
    339 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
    340 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
    341 ; AVX2-NEXT:    vmovdqa %ymm0, 96(%rdi)
    342 ; AVX2-NEXT:    vmovdqa %ymm1, 64(%rdi)
    343 ; AVX2-NEXT:    vmovdqa %ymm5, 32(%rdi)
    344 ; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
    345 ; AVX2-NEXT:    vzeroupper
    346 ; AVX2-NEXT:    retq
    347 ;
    348 ; AVX512-LABEL: interleaved_store_vf32_i8_stride4:
    349 ; AVX512:       # %bb.0:
    350 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
    351 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
    352 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
    353 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
    354 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
    355 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
    356 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
    357 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
    358 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm2
    359 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm5
    360 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
    361 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
    362 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
    363 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
    364 ; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rdi)
    365 ; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdi)
    366 ; AVX512-NEXT:    vzeroupper
    367 ; AVX512-NEXT:    retq
    368   %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
    369   %v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
    370   %interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
    371   store <128 x i8> %interleaved.vec, <128 x i8>* %p
     372   ret void
    373 }
    374 
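; vf16 stride-4 byte interleave: the same transpose on <16 x i8> inputs stays in
; xmm registers (vpunpck[lh]bw + vpunpck[lh]wd) and is written out as two ymm
; stores, or a single zmm store on AVX512.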
    375 define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, <64 x i8>* %p) {
    376 ; AVX1-LABEL: interleaved_store_vf16_i8_stride4:
    377 ; AVX1:       # %bb.0:
    378 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    379 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
    380 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
    381 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
    382 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
    383 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
    384 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    385 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
    386 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
    387 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
    388 ; AVX1-NEXT:    vmovaps %ymm0, 32(%rdi)
    389 ; AVX1-NEXT:    vmovaps %ymm1, (%rdi)
    390 ; AVX1-NEXT:    vzeroupper
    391 ; AVX1-NEXT:    retq
    392 ;
    393 ; AVX2-LABEL: interleaved_store_vf16_i8_stride4:
    394 ; AVX2:       # %bb.0:
    395 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    396 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
    397 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
    398 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
    399 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
    400 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
    401 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    402 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
    403 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
    404 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
    405 ; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdi)
    406 ; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
    407 ; AVX2-NEXT:    vzeroupper
    408 ; AVX2-NEXT:    retq
    409 ;
    410 ; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
    411 ; AVX512:       # %bb.0:
    412 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    413 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
    414 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
    415 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
    416 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
    417 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
    418 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    419 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
    420 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
    421 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
    422 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
    423 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdi)
    424 ; AVX512-NEXT:    vzeroupper
    425 ; AVX512-NEXT:    retq
    426 %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    427 %v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    428 %interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>
    429 store <64 x i8> %interleaved.vec, <64 x i8>* %p
    430 ret void
    431 }
    432 
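; vf8 stride-4 load: four <8 x i8> stride-4 slices of a <32 x i8> load are combined
; with adds and a multiply. The <8 x i8> values are promoted to words, so the
; gathers are done with vpshufb and the arithmetic with vpaddw/vpmullw.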
    433 define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
    434 ; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
    435 ; AVX1:       # %bb.0:
    436 ; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
    437 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    438 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    439 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
    440 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
    441 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
    442 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
    443 ; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
    444 ; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
    445 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
    446 ; AVX1-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
    447 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
    448 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
    449 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
    450 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
    451 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
    452 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
    453 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
    454 ; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
    455 ; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
    456 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
    457 ; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
    458 ; AVX1-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
    459 ; AVX1-NEXT:    vzeroupper
    460 ; AVX1-NEXT:    retq
    461 ;
    462 ; AVX-LABEL: interleaved_load_vf8_i8_stride4:
    463 ; AVX:       # %bb.0:
    464 ; AVX-NEXT:    vmovdqu (%rdi), %ymm0
    465 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    466 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm2
    467 ; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
    468 ; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
    469 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
    470 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
    471 ; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
    472 ; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
    473 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
    474 ; AVX-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
    475 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
    476 ; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
    477 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
    478 ; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
    479 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
    480 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
    481 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
    482 ; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
    483 ; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
    484 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
    485 ; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
    486 ; AVX-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
    487 ; AVX-NEXT:    vzeroupper
    488 ; AVX-NEXT:    retq
    489   %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
    490   %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
    491   %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
    492   %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
    493   %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
    494 
    495   %add1 = add <8 x i8> %v1, %v2
    496   %add2 = add <8 x i8> %v4, %v3
    497   %add3 = mul <8 x i8> %add1, %add2
    498   ret <8 x i8> %add3
    499 }
    500 
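; vf16 stride-4 load feeding i1 compares: the stride-4 byte gathers use per-lane
; vpshufb + vpunpckldq + blends. On AVX1/AVX2 the (cmp1 == cmp2) result becomes
; vpcmpeqb plus vpxor against a constant-pool mask (forming the xnor); AVX512BW
; uses k-register compares with kxnorw and vpmovm2b.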
    501 define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
    502 ; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
    503 ; AVX1:       # %bb.0:
    504 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    505 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
    506 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
    507 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
    508 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
    509 ; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
    510 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
    511 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
    512 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    513 ; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
    514 ; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
    515 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
    516 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
    517 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
    518 ; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
    519 ; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
    520 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
    521 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    522 ; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
    523 ; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
    524 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
    525 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
    526 ; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm3, %xmm3
    527 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
    528 ; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
    529 ; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
    530 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
    531 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    532 ; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
    533 ; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
    534 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
    535 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
    536 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
    537 ; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
    538 ; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
    539 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    540 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    541 ; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
    542 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    543 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
    544 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
    545 ; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0
    546 ; AVX1-NEXT:    vpxor %xmm0, %xmm3, %xmm0
    547 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
    548 ; AVX1-NEXT:    vzeroupper
    549 ; AVX1-NEXT:    retq
    550 ;
    551 ; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
    552 ; AVX2:       # %bb.0:
    553 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    554 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
    555 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
    556 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
    557 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
    558 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
    559 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
    560 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    561 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
    562 ; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm6
    563 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm4
    564 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
    565 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
    566 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
    567 ; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm6
    568 ; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
    569 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
    570 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    571 ; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
    572 ; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
    573 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
    574 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
    575 ; AVX2-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
    576 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
    577 ; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm6
    578 ; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
    579 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
    580 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    581 ; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
    582 ; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
    583 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
    584 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
    585 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
    586 ; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
    587 ; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
    588 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
    589 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    590 ; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
    591 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
    592 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
    593 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
    594 ; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm4, %xmm0
    595 ; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
    596 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
    597 ; AVX2-NEXT:    vzeroupper
    598 ; AVX2-NEXT:    retq
    599 ;
    600 ; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
    601 ; AVX512:       # %bb.0:
    602 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
    603 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    604 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
    605 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
    606 ; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
    607 ; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
    608 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
    609 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
    610 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    611 ; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
    612 ; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
    613 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
    614 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm3[2,3]
    615 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
    616 ; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
    617 ; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
    618 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
    619 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    620 ; AVX512-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
    621 ; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
    622 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
    623 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
    624 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
    625 ; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm7
    626 ; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
    627 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
    628 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    629 ; AVX512-NEXT:    vpshufb %xmm7, %xmm4, %xmm3
    630 ; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
    631 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
    632 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3]
    633 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
    634 ; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
    635 ; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
    636 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    637 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    638 ; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
    639 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    640 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
    641 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
    642 ; AVX512-NEXT:    vpcmpeqb %zmm5, %zmm8, %k0
    643 ; AVX512-NEXT:    vpcmpeqb %zmm0, %zmm3, %k1
    644 ; AVX512-NEXT:    kxnorw %k1, %k0, %k0
    645 ; AVX512-NEXT:    vpmovm2b %k0, %zmm0
    646 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    647 ; AVX512-NEXT:    vzeroupper
    648 ; AVX512-NEXT:    retq
    649   %wide.vec = load <64 x i8>, <64 x i8>* %ptr
    650   %v1 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
    651   %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
    652   %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
    653   %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
    654 
    655   %cmp1 = icmp eq <16 x i8> %v1, %v2
    656   %cmp2 = icmp eq <16 x i8> %v3, %v4
    657   %res = icmp eq <16 x i1> %cmp1, %cmp2
    658 
    659   ret <16 x i1> %res
    660 }
    661 
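; vf32 stride-4 load: the same per-lane vpshufb/vpunpckldq/blend gathers applied to
; a 128-byte region, with the vpcmpeqb results combined via xor against a
; constant-pool mask to produce the <32 x i1> result.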
    662 define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
    663 ; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
    664 ; AVX1:       # %bb.0:
    665 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm11
    666 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm14
    667 ; AVX1-NEXT:    vmovdqa 64(%rdi), %ymm2
    668 ; AVX1-NEXT:    vmovdqa 96(%rdi), %ymm3
    669 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
    670 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm12
    671 ; AVX1-NEXT:    vpshufb %xmm6, %xmm12, %xmm5
    672 ; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
    673 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
    674 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    675 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm13
    676 ; AVX1-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
    677 ; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
    678 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
    679 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
    680 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm8
    681 ; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm15
    682 ; AVX1-NEXT:    vpshufb %xmm6, %xmm15, %xmm5
    683 ; AVX1-NEXT:    vpshufb %xmm6, %xmm14, %xmm6
    684 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
    685 ; AVX1-NEXT:    vextractf128 $1, %ymm11, %xmm6
    686 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm4
    687 ; AVX1-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
    688 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
    689 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
    690 ; AVX1-NEXT:    vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
    691 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
    692 ; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm4
    693 ; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm5
    694 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
    695 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    696 ; AVX1-NEXT:    vpshufb %xmm5, %xmm13, %xmm1
    697 ; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm7
    698 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
    699 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
    700 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
    701 ; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm4
    702 ; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
    703 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
    704 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm4
    705 ; AVX1-NEXT:    vpshufb %xmm5, %xmm11, %xmm5
    706 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
    707 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
    708 ; AVX1-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
    709 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
    710 ; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
    711 ; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm4
    712 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
    713 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    714 ; AVX1-NEXT:    vpshufb %xmm4, %xmm13, %xmm5
    715 ; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm7
    716 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
    717 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
    718 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
    719 ; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm5
    720 ; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
    721 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
    722 ; AVX1-NEXT:    vpshufb %xmm4, %xmm6, %xmm5
    723 ; AVX1-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
    724 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
    725 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
    726 ; AVX1-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7]
    727 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
    728 ; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
    729 ; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
    730 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
    731 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    732 ; AVX1-NEXT:    vpshufb %xmm3, %xmm13, %xmm4
    733 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
    734 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
    735 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
    736 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
    737 ; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm2
    738 ; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
    739 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    740 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm2
    741 ; AVX1-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
    742 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
    743 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
    744 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
    745 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm1
    746 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm2
    747 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm1
    748 ; AVX1-NEXT:    vpcmpeqb %xmm9, %xmm8, %xmm2
    749 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
    750 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
    751 ; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm3
    752 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm2
    753 ; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm10, %xmm0
    754 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
    755 ; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
    756 ; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
    757 ; AVX1-NEXT:    retq
    758 ;
    759 ; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
    760 ; AVX2:       # %bb.0:
    761 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm11
    762 ; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
    763 ; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm7
    764 ; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm5
    765 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm9
    766 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
    767 ; AVX2-NEXT:    vpshufb %xmm6, %xmm9, %xmm3
    768 ; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm4
    769 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
    770 ; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm10
    771 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    772 ; AVX2-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
    773 ; AVX2-NEXT:    vpshufb %xmm2, %xmm11, %xmm0
    774 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
    775 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm0[0,1],xmm4[2,3]
    776 ; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm12
    777 ; AVX2-NEXT:    vpshufb %xmm6, %xmm12, %xmm3
    778 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
    779 ; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm13
    780 ; AVX2-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
    781 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
    782 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
    783 ; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm6
    784 ; AVX2-NEXT:    vpshufb %xmm2, %xmm6, %xmm0
    785 ; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
    786 ; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm7
    787 ; AVX2-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
    788 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
    789 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
    790 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
    791 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
    792 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
    793 ; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm3
    794 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
    795 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
    796 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    797 ; AVX2-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
    798 ; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
    799 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
    800 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
    801 ; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
    802 ; AVX2-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
    803 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
    804 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
    805 ; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
    806 ; AVX2-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
    807 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
    808 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
    809 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
    810 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
    811 ; AVX2-NEXT:    vpcmpeqb %ymm0, %ymm8, %ymm8
    812 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
    813 ; AVX2-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
    814 ; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
    815 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
    816 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    817 ; AVX2-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
    818 ; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
    819 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
    820 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
    821 ; AVX2-NEXT:    vpshufb %xmm0, %xmm12, %xmm4
    822 ; AVX2-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
    823 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
    824 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
    825 ; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
    826 ; AVX2-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
    827 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
    828 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
    829 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
    830 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
    831 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
    832 ; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm3
    833 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    834 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
    835 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    836 ; AVX2-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
    837 ; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
    838 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
    839 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
    840 ; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
    841 ; AVX2-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
    842 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
    843 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
    844 ; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
    845 ; AVX2-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
    846 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
    847 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
    848 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
    849 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
    850 ; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
    851 ; AVX2-NEXT:    vpxor %ymm0, %ymm8, %ymm0
    852 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
    853 ; AVX2-NEXT:    retq
    854 ;
    855 ; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
    856 ; AVX512:       # %bb.0:
    857 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
    858 ; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm7
    859 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    860 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm10
    861 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
    862 ; AVX512-NEXT:    vpshufb %xmm6, %xmm10, %xmm3
    863 ; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm4
    864 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
    865 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm11
    866 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    867 ; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm5
    868 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
    869 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
    870 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm3[0,1],xmm4[2,3]
    871 ; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm5
    872 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm12
    873 ; AVX512-NEXT:    vpshufb %xmm6, %xmm12, %xmm3
    874 ; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
    875 ; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm13
    876 ; AVX512-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
    877 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
    878 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
    879 ; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm14
    880 ; AVX512-NEXT:    vpshufb %xmm2, %xmm14, %xmm4
    881 ; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
    882 ; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm7
    883 ; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
    884 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
    885 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
    886 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
    887 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
    888 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
    889 ; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
    890 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm4
    891 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
    892 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    893 ; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm5
    894 ; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm6
    895 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
    896 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
    897 ; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm5
    898 ; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
    899 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
    900 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
    901 ; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm5
    902 ; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
    903 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
    904 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
    905 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
    906 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm2[4,5,6,7]
    907 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
    908 ; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
    909 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm4
    910 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
    911 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    912 ; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm5
    913 ; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm6
    914 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
    915 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
    916 ; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm5
    917 ; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
    918 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
    919 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
    920 ; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm5
    921 ; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
    922 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
    923 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
    924 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
    925 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
    926 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
    927 ; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
    928 ; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
    929 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
    930 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    931 ; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm5
    932 ; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
    933 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
    934 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
    935 ; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
    936 ; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
    937 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
    938 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
    939 ; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm3
    940 ; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
    941 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
    942 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
    943 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
    944 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
    945 ; AVX512-NEXT:    vpcmpeqb %zmm9, %zmm8, %k0
    946 ; AVX512-NEXT:    vpcmpeqb %zmm0, %zmm2, %k1
    947 ; AVX512-NEXT:    kxnord %k1, %k0, %k0
    948 ; AVX512-NEXT:    vpmovm2b %k0, %zmm0
    949 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
    950 ; AVX512-NEXT:    retq
    951   %wide.vec = load <128 x i8>, <128 x i8>* %ptr
    952   %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
    953 
    954   %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
    955 
    956   %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
    957 
    958   %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
    959 
    960   %cmp1 = icmp eq <32 x i8> %v1, %v2
    961   %cmp2 = icmp eq <32 x i8> %v3, %v4
    962   %res = icmp eq <32 x i1> %cmp1, %cmp2
    963 
    964   ret <32 x i1> %res
    965 }
    966 
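; Interleaved store with stride 4: four <8 x i8> operands are interleaved into a <32 x i8> vector and stored to %p.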
    967 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
    968 ; AVX1-LABEL: interleaved_store_vf8_i8_stride4:
    969 ; AVX1:       # %bb.0:
    970 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
    971 ; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
    972 ; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
    973 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    974 ; AVX1-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
    975 ; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
    976 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
    977 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    978 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    979 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
    980 ; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
    981 ; AVX1-NEXT:    vzeroupper
    982 ; AVX1-NEXT:    retq
    983 ;
    984 ; AVX-LABEL: interleaved_store_vf8_i8_stride4:
    985 ; AVX:       # %bb.0:
    986 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
    987 ; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
    988 ; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
    989 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    990 ; AVX-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
    991 ; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
    992 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
    993 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    994 ; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    995 ; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
    996 ; AVX-NEXT:    vmovdqa %ymm0, (%rdi)
    997 ; AVX-NEXT:    vzeroupper
    998 ; AVX-NEXT:    retq
    999 %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1000 %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1001 %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
   1002 store <32 x i8> %interleaved.vec, <32 x i8>* %p
   1003 ret void
   1004 }
   1005 
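; Interleaved load with stride 3: a <96 x i8> load is deinterleaved into three <32 x i8> vectors, which are then summed.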
   1006 define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
   1007 ; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
   1008 ; AVX1:       # %bb.0:
   1009 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
   1010 ; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
   1011 ; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
   1012 ; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
   1013 ; AVX1-NEXT:    vmovdqa 64(%rdi), %xmm4
   1014 ; AVX1-NEXT:    vmovdqa 80(%rdi), %xmm5
   1015 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
   1016 ; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
   1017 ; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
   1018 ; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
   1019 ; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
   1020 ; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
   1021 ; AVX1-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
   1022 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
   1023 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
   1024 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
   1025 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
   1026 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm8
   1027 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
   1028 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
   1029 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm2
   1030 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
   1031 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
   1032 ; AVX1-NEXT:    vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
   1033 ; AVX1-NEXT:    vandnps %ymm2, %ymm5, %ymm2
   1034 ; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm5
   1035 ; AVX1-NEXT:    vorps %ymm2, %ymm5, %ymm2
   1036 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
   1037 ; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
   1038 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
   1039 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
   1040 ; AVX1-NEXT:    vpor %xmm3, %xmm6, %xmm3
   1041 ; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   1042 ; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
   1043 ; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
   1044 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1045 ; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
   1046 ; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
   1047 ; AVX1-NEXT:    vpaddb %xmm9, %xmm2, %xmm2
   1048 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
   1049 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1050 ; AVX1-NEXT:    retq
   1051 ;
   1052 ; AVX-LABEL: interleaved_load_vf32_i8_stride3:
   1053 ; AVX:       # %bb.0:
   1054 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
   1055 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
   1056 ; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
   1057 ; AVX-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
   1058 ; AVX-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
   1059 ; AVX-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
   1060 ; AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
   1061 ; AVX-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   1062 ; AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
   1063 ; AVX-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   1064 ; AVX-NEXT:    vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
   1065 ; AVX-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
   1066 ; AVX-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
   1067 ; AVX-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
   1068 ; AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
   1069 ; AVX-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
   1070 ; AVX-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
   1071 ; AVX-NEXT:    vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
   1072 ; AVX-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
   1073 ; AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1074 ; AVX-NEXT:    retq
   1075 	%wide.vec = load <96 x i8>, <96 x i8>* %ptr
   1076 	%v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
   1077 	%v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
   1078 	%v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
   1079 	%add1 = add <32 x i8> %v1, %v2
   1080 	%add2 = add <32 x i8> %v3, %add1
   1081 	ret <32 x i8> %add2
   1082 }
   1083 
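; Interleaved load with stride 3: a <48 x i8> load is deinterleaved into three <16 x i8> vectors, which are then summed.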
   1084 define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
   1085 ; AVX1-LABEL: interleaved_load_vf16_i8_stride3:
   1086 ; AVX1:       # %bb.0:
   1087 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
   1088 ; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
   1089 ; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
   1090 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
   1091 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   1092 ; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   1093 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   1094 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
   1095 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
   1096 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
   1097 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
   1098 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
   1099 ; AVX1-NEXT:    vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
   1100 ; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
   1101 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
   1102 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
   1103 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1104 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
   1105 ; AVX1-NEXT:    retq
   1106 ;
   1107 ; AVX-LABEL: interleaved_load_vf16_i8_stride3:
   1108 ; AVX:       # %bb.0:
   1109 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
   1110 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
   1111 ; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
   1112 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
   1113 ; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   1114 ; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   1115 ; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   1116 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
   1117 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
   1118 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
   1119 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
   1120 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
   1121 ; AVX-NEXT:    vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
   1122 ; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
   1123 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
   1124 ; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
   1125 ; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1126 ; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
   1127 ; AVX-NEXT:    retq
   1128 	%wide.vec = load <48 x i8>, <48 x i8>* %ptr
    1129 	%v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45>
   1130 	%v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
   1131 	%v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
   1132 	%add1 = add <16 x i8> %v1, %v2
   1133 	%add2 = add <16 x i8> %v3, %add1
   1134 	ret <16 x i8> %add2
   1135 }
   1136 
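; Interleaved load with stride 3: a <24 x i8> load is deinterleaved into three <8 x i8> vectors, which are then summed.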
   1137 define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
   1138 ; AVX1-LABEL: interleaved_load_vf8_i8_stride3:
   1139 ; AVX1:       # %bb.0:
   1140 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1141 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1142 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
   1143 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
   1144 ; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1145 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
   1146 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
   1147 ; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
   1148 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
   1149 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
   1150 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1151 ; AVX1-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
   1152 ; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
   1153 ; AVX1-NEXT:    vzeroupper
   1154 ; AVX1-NEXT:    retq
   1155 ;
   1156 ; AVX-LABEL: interleaved_load_vf8_i8_stride3:
   1157 ; AVX:       # %bb.0:
   1158 ; AVX-NEXT:    vmovdqa (%rdi), %ymm0
   1159 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1160 ; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
   1161 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
   1162 ; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
   1163 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
   1164 ; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
   1165 ; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
   1166 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
   1167 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
   1168 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1169 ; AVX-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
   1170 ; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
   1171 ; AVX-NEXT:    vzeroupper
   1172 ; AVX-NEXT:    retq
   1173 	%wide.vec = load <24 x i8>, <24 x i8>* %ptr
    1174 	%v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
   1175 	%v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
   1176 	%v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
   1177 	%add1 = add <8 x i8> %v1, %v2
   1178 	%add2 = add <8 x i8> %v3, %add1
   1179 	ret <8 x i8> %add2
   1180 }
   1181 
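; Interleaved store with stride 3: three <8 x i8> operands are interleaved into a <24 x i8> vector and stored to %p.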
   1182 define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
   1183 ; AVX1-LABEL: interleaved_store_vf8_i8_stride3:
   1184 ; AVX1:       # %bb.0:
   1185 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1186 ; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   1187 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   1188 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1189 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
   1190 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
   1191 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
   1192 ; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
   1193 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
   1194 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
   1195 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1196 ; AVX1-NEXT:    vmovq %xmm0, 16(%rdi)
   1197 ; AVX1-NEXT:    vmovdqu %xmm2, (%rdi)
   1198 ; AVX1-NEXT:    retq
   1199 ;
   1200 ; AVX-LABEL: interleaved_store_vf8_i8_stride3:
   1201 ; AVX:       # %bb.0:
   1202 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1203 ; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   1204 ; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   1205 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1206 ; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
   1207 ; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
   1208 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
   1209 ; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
   1210 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
   1211 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
   1212 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
   1213 ; AVX-NEXT:    vmovq %xmm0, 16(%rdi)
   1214 ; AVX-NEXT:    vmovdqu %xmm2, (%rdi)
   1215 ; AVX-NEXT:    retq
   1216 %1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1217 %2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1218 %interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
   1219 store <24 x i8> %interleaved.vec, <24 x i8>* %p, align 1
   1220 ret void
   1221 }
   1222 
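; Interleaved store with stride 3: three <16 x i8> operands are interleaved into a <48 x i8> vector and stored to %p.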
   1223 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
   1224 ; AVX1-LABEL: interleaved_store_vf16_i8_stride3:
   1225 ; AVX1:       # %bb.0:
   1226 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
   1227 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
   1228 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
   1229 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
   1230 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
   1231 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
   1232 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
   1233 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
   1234 ; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
   1235 ; AVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
   1236 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm6
   1237 ; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   1238 ; AVX1-NEXT:    vpor %xmm6, %xmm0, %xmm0
   1239 ; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
   1240 ; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
   1241 ; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
   1242 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
   1243 ; AVX1-NEXT:    vmovdqu %xmm1, 32(%rdi)
   1244 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
   1245 ; AVX1-NEXT:    vzeroupper
   1246 ; AVX1-NEXT:    retq
   1247 ;
   1248 ; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
   1249 ; AVX2:       # %bb.0:
   1250 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
   1251 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
   1252 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
   1253 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
   1254 ; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
   1255 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
   1256 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
   1257 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
   1258 ; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
   1259 ; AVX2-NEXT:    vpor %xmm4, %xmm6, %xmm4
   1260 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm6
   1261 ; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   1262 ; AVX2-NEXT:    vpor %xmm6, %xmm0, %xmm0
   1263 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
   1264 ; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
   1265 ; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
   1266 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
   1267 ; AVX2-NEXT:    vmovdqu %xmm1, 32(%rdi)
   1268 ; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
   1269 ; AVX2-NEXT:    vzeroupper
   1270 ; AVX2-NEXT:    retq
   1271 ;
   1272 ; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
   1273 ; AVX512:       # %bb.0:
   1274 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
   1275 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
   1276 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
   1277 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
   1278 ; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
   1279 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
   1280 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
   1281 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
   1282 ; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
   1283 ; AVX512-NEXT:    vpor %xmm4, %xmm6, %xmm4
   1284 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm6
   1285 ; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   1286 ; AVX512-NEXT:    vpor %xmm6, %xmm0, %xmm0
   1287 ; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
   1288 ; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
   1289 ; AVX512-NEXT:    vpor %xmm2, %xmm1, %xmm1
   1290 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
   1291 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
   1292 ; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
   1293 ; AVX512-NEXT:    vextracti32x4 $2, %zmm1, 32(%rdi)
   1294 ; AVX512-NEXT:    vzeroupper
   1295 ; AVX512-NEXT:    retq
   1296 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   1297 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1298 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
   1299 store <48 x i8> %interleaved.vec, <48 x i8>* %p, align 1
   1300 ret void
   1301 }
   1302 
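; Interleaved store with stride 3: three <32 x i8> operands are interleaved into a <96 x i8> vector and stored to %p.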
   1303 define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <96 x i8>* %p) {
   1304 ; AVX1-LABEL: interleaved_store_vf32_i8_stride3:
   1305 ; AVX1:       # %bb.0:
   1306 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1307 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
   1308 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
   1309 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
   1310 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
   1311 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
   1312 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
   1313 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
   1314 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm7 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
   1315 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
   1316 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
   1317 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
   1318 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
   1319 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
   1320 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
   1321 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
   1322 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
   1323 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
   1324 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
   1325 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
   1326 ; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   1327 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
   1328 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm6, %ymm0
   1329 ; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
   1330 ; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
   1331 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
   1332 ; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   1333 ; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
   1334 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
   1335 ; AVX1-NEXT:    vmovups %ymm2, 64(%rdi)
   1336 ; AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
   1337 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
   1338 ; AVX1-NEXT:    vzeroupper
   1339 ; AVX1-NEXT:    retq
   1340 ;
   1341 ; AVX2-LABEL: interleaved_store_vf32_i8_stride3:
   1342 ; AVX2:       # %bb.0:
   1343 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
   1344 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
   1345 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
   1346 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
   1347 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
   1348 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
   1349 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
   1350 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
   1351 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm3
   1352 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
   1353 ; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
   1354 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
   1355 ; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
   1356 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
   1357 ; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
   1358 ; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
   1359 ; AVX2-NEXT:    vmovdqu %ymm2, 32(%rdi)
   1360 ; AVX2-NEXT:    vmovdqu %ymm3, (%rdi)
   1361 ; AVX2-NEXT:    vzeroupper
   1362 ; AVX2-NEXT:    retq
   1363 ;
   1364 ; AVX512-LABEL: interleaved_store_vf32_i8_stride3:
   1365 ; AVX512:       # %bb.0:
   1366 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
   1367 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
   1368 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
   1369 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
   1370 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
   1371 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
   1372 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
   1373 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
   1374 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm3
   1375 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
   1376 ; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
   1377 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
   1378 ; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
   1379 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
   1380 ; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
   1381 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm1
   1382 ; AVX512-NEXT:    vmovdqu %ymm0, 64(%rdi)
   1383 ; AVX512-NEXT:    vmovdqu64 %zmm1, (%rdi)
   1384 ; AVX512-NEXT:    vzeroupper
   1385 ; AVX512-NEXT:    retq
   1386 %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
   1387 %2 = shufflevector <32 x i8> %c, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1388 %interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
   1389 store <96 x i8> %interleaved.vec, <96 x i8>* %p, align 1
   1390 ret void
   1391 }
   1392 
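; Interleaved store with stride 3: three <64 x i8> operands are interleaved into a <192 x i8> vector and stored to %p.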
   1393 define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <192 x i8>* %p) {
   1394 ; AVX1-LABEL: interleaved_store_vf64_i8_stride3:
   1395 ; AVX1:       # %bb.0:
   1396 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
   1397 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
   1398 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
   1399 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
   1400 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
   1401 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
   1402 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
   1403 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm6[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
   1404 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm15 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
   1405 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1406 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
   1407 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
   1408 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
   1409 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm6
   1410 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
   1411 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm13 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
   1412 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm7
   1413 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
   1414 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
   1415 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm14 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
   1416 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
   1417 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
   1418 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
   1419 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
   1420 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
   1421 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
   1422 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
   1423 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm15 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
   1424 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
   1425 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm7 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
   1426 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
   1427 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
   1428 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
   1429 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
   1430 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
   1431 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
   1432 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
   1433 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
   1434 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
   1435 ; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   1436 ; AVX1-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
   1437 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
   1438 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
   1439 ; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   1440 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
   1441 ; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
   1442 ; AVX1-NEXT:    vpshufb %xmm5, %xmm14, %xmm6
   1443 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
   1444 ; AVX1-NEXT:    vpshufb %xmm5, %xmm9, %xmm6
   1445 ; AVX1-NEXT:    vpshufb %xmm5, %xmm15, %xmm7
   1446 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
   1447 ; AVX1-NEXT:    vpshufb %xmm5, %xmm11, %xmm7
   1448 ; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
   1449 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm4, %ymm4
   1450 ; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
   1451 ; AVX1-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
   1452 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
   1453 ; AVX1-NEXT:    vmovups %ymm3, 160(%rdi)
   1454 ; AVX1-NEXT:    vmovups %ymm4, 128(%rdi)
   1455 ; AVX1-NEXT:    vmovups %ymm6, 96(%rdi)
   1456 ; AVX1-NEXT:    vmovups %ymm1, 64(%rdi)
   1457 ; AVX1-NEXT:    vmovups %ymm2, 32(%rdi)
   1458 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
   1459 ; AVX1-NEXT:    vzeroupper
   1460 ; AVX1-NEXT:    retq
   1461 ;
   1462 ; AVX2-LABEL: interleaved_store_vf64_i8_stride3:
   1463 ; AVX2:       # %bb.0:
   1464 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
   1465 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
   1466 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
   1467 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
   1468 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm6 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
   1469 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm7 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
   1470 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
   1471 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
   1472 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
   1473 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
   1474 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
   1475 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
   1476 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
   1477 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
   1478 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20]
   1479 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
   1480 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm5, %ymm6
   1481 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
   1482 ; AVX2-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
   1483 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
   1484 ; AVX2-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
   1485 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
   1486 ; AVX2-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
   1487 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm4, %ymm2
   1488 ; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
   1489 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7]
   1490 ; AVX2-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
   1491 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
   1492 ; AVX2-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
   1493 ; AVX2-NEXT:    vmovdqu %ymm1, 160(%rdi)
   1494 ; AVX2-NEXT:    vmovdqu %ymm4, 128(%rdi)
   1495 ; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
   1496 ; AVX2-NEXT:    vmovdqu %ymm5, 32(%rdi)
   1497 ; AVX2-NEXT:    vmovdqu %ymm2, 96(%rdi)
   1498 ; AVX2-NEXT:    vmovdqu %ymm6, (%rdi)
   1499 ; AVX2-NEXT:    vzeroupper
   1500 ; AVX2-NEXT:    retq
   1501 ;
   1502 ; AVX512-LABEL: interleaved_store_vf64_i8_stride3:
   1503 ; AVX512:       # %bb.0:
   1504 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53]
   1505 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58]
   1506 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm3 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
   1507 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
   1508 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
   1509 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm2 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
   1510 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
   1511 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52]
   1512 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm3
   1513 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
   1514 ; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
   1515 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5,6,7]
   1516 ; AVX512-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
   1517 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
   1518 ; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
   1519 ; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
   1520 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
   1521 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm7
   1522 ; AVX512-NEXT:    vpshufb %ymm4, %ymm7, %ymm7
   1523 ; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
   1524 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
   1525 ; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
   1526 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
   1527 ; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
   1528 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm1
   1529 ; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm3
   1530 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
   1531 ; AVX512-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
   1532 ; AVX512-NEXT:    vmovdqu64 %zmm3, 64(%rdi)
   1533 ; AVX512-NEXT:    vmovdqu64 %zmm1, (%rdi)
   1534 ; AVX512-NEXT:    vzeroupper
   1535 ; AVX512-NEXT:    retq
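        ; Interleave %a, %b and %c byte-wise (a0,b0,c0,a1,b1,c1,...) and store the
        ; resulting 192 bytes to %p.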
   1536 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
   1537 %2 = shufflevector <64 x i8> %c, <64 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   1538 %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
   1539 store <192 x i8> %3, <192 x i8>* %p, align 1
   1540 ret void
   1541 }
   1542 
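        ; Load 192 bytes, de-interleave them with stride 3 into three <64 x i8> vectors
        ; (elements 0,3,6,..., 1,4,7,... and 2,5,8,...), and return their sum.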
   1543 define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr) {
   1544 ; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
   1545 ; AVX1:       # %bb.0:
   1546 ; AVX1-NEXT:    vmovdqu (%rdi), %xmm11
   1547 ; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm10
   1548 ; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm8
   1549 ; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm3
   1550 ; AVX1-NEXT:    vmovdqu 64(%rdi), %xmm12
   1551 ; AVX1-NEXT:    vmovdqu 80(%rdi), %xmm9
   1552 ; AVX1-NEXT:    vmovdqu 96(%rdi), %xmm6
   1553 ; AVX1-NEXT:    vmovdqu 112(%rdi), %xmm14
   1554 ; AVX1-NEXT:    vmovdqu 128(%rdi), %xmm13
   1555 ; AVX1-NEXT:    vmovdqu 144(%rdi), %xmm5
   1556 ; AVX1-NEXT:    vmovdqu 160(%rdi), %xmm1
   1557 ; AVX1-NEXT:    vmovdqu 176(%rdi), %xmm15
   1558 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
   1559 ; AVX1-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
   1560 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
   1561 ; AVX1-NEXT:    vpshufb %xmm4, %xmm11, %xmm2
   1562 ; AVX1-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
   1563 ; AVX1-NEXT:    vpshufb %xmm4, %xmm10, %xmm11
   1564 ; AVX1-NEXT:    vpshufb %xmm4, %xmm12, %xmm12
   1565 ; AVX1-NEXT:    vpshufb %xmm4, %xmm14, %xmm14
   1566 ; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
   1567 ; AVX1-NEXT:    vpshufb %xmm4, %xmm13, %xmm0
   1568 ; AVX1-NEXT:    vpshufb %xmm4, %xmm15, %xmm7
   1569 ; AVX1-NEXT:    vpshufb %xmm4, %xmm8, %xmm13
   1570 ; AVX1-NEXT:    vpshufb %xmm4, %xmm9, %xmm4
   1571 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm15 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
   1572 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm10 = xmm13[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
   1573 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
   1574 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm0[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
   1575 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
   1576 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
   1577 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
   1578 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm7
   1579 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7,8,9,10]
   1580 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
   1581 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm14[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
   1582 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm14
   1583 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
   1584 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
   1585 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm12
   1586 ; AVX1-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
   1587 ; AVX1-NEXT:    vandnps %ymm12, %ymm13, %ymm12
   1588 ; AVX1-NEXT:    vandps %ymm13, %ymm14, %ymm14
   1589 ; AVX1-NEXT:    vorps %ymm12, %ymm14, %ymm12
   1590 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm14
   1591 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
   1592 ; AVX1-NEXT:    vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
   1593 ; AVX1-NEXT:    vandnps %ymm14, %ymm13, %ymm14
   1594 ; AVX1-NEXT:    vandps %ymm13, %ymm7, %ymm7
   1595 ; AVX1-NEXT:    vorps %ymm14, %ymm7, %ymm13
   1596 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
   1597 ; AVX1-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
   1598 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
   1599 ; AVX1-NEXT:    vpshufb %xmm7, %xmm15, %xmm4
   1600 ; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
   1601 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm10[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
   1602 ; AVX1-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
   1603 ; AVX1-NEXT:    vpshufb %xmm7, %xmm10, %xmm4
   1604 ; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
   1605 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm9[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
   1606 ; AVX1-NEXT:    vpshufb %xmm14, %xmm5, %xmm4
   1607 ; AVX1-NEXT:    vpshufb %xmm7, %xmm9, %xmm5
   1608 ; AVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
   1609 ; AVX1-NEXT:    vpshufb %xmm14, %xmm6, %xmm5
   1610 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm8[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
   1611 ; AVX1-NEXT:    vpshufb %xmm7, %xmm8, %xmm0
   1612 ; AVX1-NEXT:    vpor %xmm5, %xmm0, %xmm5
   1613 ; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm0
   1614 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
   1615 ; AVX1-NEXT:    vpaddb %xmm0, %xmm4, %xmm1
   1616 ; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
   1617 ; AVX1-NEXT:    vpaddb -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
   1618 ; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
   1619 ; AVX1-NEXT:    vpaddb %xmm11, %xmm12, %xmm3
   1620 ; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
   1621 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
   1622 ; AVX1-NEXT:    vpaddb %xmm6, %xmm13, %xmm2
   1623 ; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
   1624 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
   1625 ; AVX1-NEXT:    retq
   1626 ;
   1627 ; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
   1628 ; AVX2:       # %bb.0:
   1629 ; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
   1630 ; AVX2-NEXT:    vmovdqu 16(%rdi), %xmm1
   1631 ; AVX2-NEXT:    vmovdqu 32(%rdi), %xmm2
   1632 ; AVX2-NEXT:    vmovdqu 96(%rdi), %xmm3
   1633 ; AVX2-NEXT:    vmovdqu 112(%rdi), %xmm4
   1634 ; AVX2-NEXT:    vmovdqu 128(%rdi), %xmm5
   1635 ; AVX2-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
   1636 ; AVX2-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
   1637 ; AVX2-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
   1638 ; AVX2-NEXT:    vinserti128 $1, 144(%rdi), %ymm3, %ymm3
   1639 ; AVX2-NEXT:    vinserti128 $1, 160(%rdi), %ymm4, %ymm4
   1640 ; AVX2-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm5
   1641 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
   1642 ; AVX2-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
   1643 ; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
   1644 ; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
   1645 ; AVX2-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
   1646 ; AVX2-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
   1647 ; AVX2-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
   1648 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm6 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
   1649 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm7 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
   1650 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
   1651 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
   1652 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
   1653 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26]
   1654 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm7[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
   1655 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm5 = ymm6[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
   1656 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
   1657 ; AVX2-NEXT:    # ymm8 = mem[0,1,0,1]
   1658 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm0, %ymm1, %ymm1
   1659 ; AVX2-NEXT:    vpaddb %ymm5, %ymm1, %ymm1
   1660 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
   1661 ; AVX2-NEXT:    vpaddb %ymm4, %ymm2, %ymm2
   1662 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm6, %ymm0, %ymm0
   1663 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
   1664 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1665 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm7, %ymm3, %ymm1
   1666 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
   1667 ; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
   1668 ; AVX2-NEXT:    retq
   1669 ;
   1670 ; AVX512-LABEL: interleaved_load_vf64_i8_stride3:
   1671 ; AVX512:       # %bb.0:
   1672 ; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
   1673 ; AVX512-NEXT:    vmovdqu 16(%rdi), %xmm1
   1674 ; AVX512-NEXT:    vmovdqu 32(%rdi), %xmm2
   1675 ; AVX512-NEXT:    vmovdqu 96(%rdi), %xmm3
   1676 ; AVX512-NEXT:    vmovdqu 112(%rdi), %xmm4
   1677 ; AVX512-NEXT:    vmovdqu 128(%rdi), %xmm5
   1678 ; AVX512-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
   1679 ; AVX512-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
   1680 ; AVX512-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
   1681 ; AVX512-NEXT:    vinserti128 $1, 144(%rdi), %ymm3, %ymm3
   1682 ; AVX512-NEXT:    vinserti128 $1, 160(%rdi), %ymm4, %ymm4
   1683 ; AVX512-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm5
   1684 ; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
   1685 ; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
   1686 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
   1687 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
   1688 ; AVX512-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
   1689 ; AVX512-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
   1690 ; AVX512-NEXT:    vpshufb %zmm3, %zmm2, %zmm2
   1691 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
   1692 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
   1693 ; AVX512-NEXT:    movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
   1694 ; AVX512-NEXT:    kmovq %rax, %k1
   1695 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
   1696 ; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
   1697 ; AVX512-NEXT:    vpblendvb %ymm4, %ymm3, %ymm0, %ymm5
   1698 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
   1699 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 {%k1} = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
   1700 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
   1701 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
   1702 ; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
   1703 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
   1704 ; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
   1705 ; AVX512-NEXT:    vpblendvb %ymm4, %ymm2, %ymm6, %ymm2
   1706 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
   1707 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
   1708 ; AVX512-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
   1709 ; AVX512-NEXT:    retq
   1710 %wide.vec = load <192 x i8>, <192 x i8>* %ptr, align 1
   1711 %v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
   1712 %v2 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
   1713 %v3 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
   1714 %add1 = add <64 x i8> %v1, %v2
   1715 %add2 = add <64 x i8> %v3, %add1
   1716 ret <64 x i8> %add2
   1717 }
   1718 
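        ; Interleave %a, %b, %c and %d byte-wise (a0,b0,c0,d0,a1,b1,c1,d1,...) and store
        ; the resulting 256 bytes to %p.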
   1719 define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d, <256 x i8>* %p) {
   1720 ; AVX1-LABEL: interleaved_store_vf64_i8_stride4:
   1721 ; AVX1:       # %bb.0:
   1722 ; AVX1-NEXT:    subq $24, %rsp
   1723 ; AVX1-NEXT:    .cfi_def_cfa_offset 32
   1724 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
   1725 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm11
   1726 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm12
   1727 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
   1728 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
   1729 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm13
   1730 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm14
   1731 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
   1732 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
   1733 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
   1734 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
   1735 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
   1736 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
   1737 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
   1738 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
   1739 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
   1740 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
   1741 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
   1742 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
   1743 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
   1744 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
   1745 ; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
   1746 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
   1747 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
   1748 ; AVX1-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
   1749 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm3
   1750 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
   1751 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
   1752 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
   1753 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
   1754 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
   1755 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
   1756 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm1
   1757 ; AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp) # 32-byte Spill
   1758 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
   1759 ; AVX1-NEXT:    vmovdqa %xmm8, %xmm2
   1760 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
   1761 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm8, %ymm13
   1762 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
   1763 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
   1764 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
   1765 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
   1766 ; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
   1767 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
   1768 ; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
   1769 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
   1770 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm9, %ymm14
   1771 ; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
   1772 ; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
   1773 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
   1774 ; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
   1775 ; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
   1776 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
   1777 ; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm9, %ymm9
   1778 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
   1779 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
   1780 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
   1781 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
   1782 ; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm8, %ymm0
   1783 ; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm13, %ymm8
   1784 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3]
   1785 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
   1786 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm6
   1787 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
   1788 ; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm10, %ymm2
   1789 ; AVX1-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm3 # 32-byte Reload
   1790 ; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm3, %ymm0
   1791 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
   1792 ; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm7, %ymm3
   1793 ; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm14, %ymm7
   1794 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
   1795 ; AVX1-NEXT:    vmovaps %ymm3, 224(%rdi)
   1796 ; AVX1-NEXT:    vmovaps %ymm2, 192(%rdi)
   1797 ; AVX1-NEXT:    vmovaps %ymm7, 160(%rdi)
   1798 ; AVX1-NEXT:    vmovaps %ymm0, 128(%rdi)
   1799 ; AVX1-NEXT:    vmovaps %ymm1, 96(%rdi)
   1800 ; AVX1-NEXT:    vmovaps %ymm5, 64(%rdi)
   1801 ; AVX1-NEXT:    vmovaps %ymm6, 32(%rdi)
   1802 ; AVX1-NEXT:    vmovaps %ymm8, (%rdi)
   1803 ; AVX1-NEXT:    addq $24, %rsp
   1804 ; AVX1-NEXT:    .cfi_def_cfa_offset 8
   1805 ; AVX1-NEXT:    vzeroupper
   1806 ; AVX1-NEXT:    retq
   1807 ;
   1808 ; AVX2-LABEL: interleaved_store_vf64_i8_stride4:
   1809 ; AVX2:       # %bb.0:
   1810 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
   1811 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
   1812 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
   1813 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
   1814 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
   1815 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23]
   1816 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
   1817 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31]
   1818 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11]
   1819 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
   1820 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15]
   1821 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
   1822 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
   1823 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
   1824 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
   1825 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
   1826 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm7, %ymm4
   1827 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm9, %ymm5
   1828 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
   1829 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
   1830 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm6, %ymm7
   1831 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm8, %ymm9
   1832 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
   1833 ; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
   1834 ; AVX2-NEXT:    vmovdqa %ymm1, 224(%rdi)
   1835 ; AVX2-NEXT:    vmovdqa %ymm3, 192(%rdi)
   1836 ; AVX2-NEXT:    vmovdqa %ymm0, 96(%rdi)
   1837 ; AVX2-NEXT:    vmovdqa %ymm2, 64(%rdi)
   1838 ; AVX2-NEXT:    vmovdqa %ymm9, 160(%rdi)
   1839 ; AVX2-NEXT:    vmovdqa %ymm7, 128(%rdi)
   1840 ; AVX2-NEXT:    vmovdqa %ymm5, 32(%rdi)
   1841 ; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
   1842 ; AVX2-NEXT:    vzeroupper
   1843 ; AVX2-NEXT:    retq
   1844 ;
   1845 ; AVX512-LABEL: interleaved_store_vf64_i8_stride4:
   1846 ; AVX512:       # %bb.0:
   1847 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
   1848 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
   1849 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55]
   1850 ; AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63]
   1851 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} zmm3 = zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27]
   1852 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31]
   1853 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27]
   1854 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31]
   1855 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm2
   1856 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm5
   1857 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm1[2,3]
   1858 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm0[2,3]
   1859 ; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
   1860 ; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
   1861 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm8
   1862 ; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm4
   1863 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
   1864 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm9
   1865 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
   1866 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
   1867 ; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
   1868 ; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm3
   1869 ; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm4
   1870 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
   1871 ; AVX512-NEXT:    vmovdqa64 %zmm0, 192(%rdi)
   1872 ; AVX512-NEXT:    vmovdqa64 %zmm3, 64(%rdi)
   1873 ; AVX512-NEXT:    vmovdqa64 %zmm4, 128(%rdi)
   1874 ; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdi)
   1875 ; AVX512-NEXT:    vzeroupper
   1876 ; AVX512-NEXT:    retq
   1877 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
   1878 %2 = shufflevector <64 x i8> %c, <64 x i8> %d, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
   1879 %interleaved = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
   1880 store <256 x i8> %interleaved, <256 x i8>* %p
   1881 ret void
   1882 }
   1883