; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefix=XOP

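; Shuffle two <2 x i64> inputs with mask <0,2,1> into a <3 x i64> store: the
; interleaved low qwords go out as one 16-byte store and the tail lane as a
; scalar extract.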
define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind {
; SSE2-LABEL: v3i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    movq %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v3i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pextrq $1, %xmm0, 16(%rdi)
; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE42-NEXT:    movdqa %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: v3i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; AVX-NEXT:    vpextrq $1, %xmm0, 16(%rdi)
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; XOP-LABEL: v3i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; XOP-NEXT:    vpextrq $1, %xmm0, 16(%rdi)
; XOP-NEXT:    vmovdqa %xmm1, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <2 x i64> %a, <2 x i64> %b, <3 x i32> <i32 0, i32 2, i32 1>
  store <3 x i64> %r, <3 x i64>* %p
  ret void
}
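; Same <0,2,1> mask with doubles: unpcklpd builds the first two lanes and a
; movhpd scalar store handles the tail lane.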
define void @v3f64(<2 x double> %a, <2 x double> %b, <3 x double>* %p) nounwind {
; SSE-LABEL: v3f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movhpd %xmm0, 16(%rdi)
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    movapd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: v3f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovhpd %xmm0, 16(%rdi)
; AVX-NEXT:    vmovapd %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; XOP-LABEL: v3f64:
; XOP:       # %bb.0:
; XOP-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; XOP-NEXT:    vmovhpd %xmm0, 16(%rdi)
; XOP-NEXT:    vmovapd %xmm1, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <2 x double> %a, <2 x double> %b, <3 x i32> <i32 0, i32 2, i32 1>
  store <3 x double> %r, <3 x double>* %p
  ret void
}

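; <3 x i32> version: unpack the low dwords, then store the third element on
; its own (movd on SSE2, extractps with a memory operand on SSE4.2+).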
define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
; SSE2-LABEL: v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movd %xmm2, 8(%rdi)
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v3i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    extractps $2, %xmm0, 8(%rdi)
; SSE42-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE42-NEXT:    movlps %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: v3i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vextractps $2, %xmm0, 8(%rdi)
; AVX-NEXT:    vmovlps %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; XOP-LABEL: v3i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; XOP-NEXT:    vextractps $2, %xmm0, 8(%rdi)
; XOP-NEXT:    vmovlps %xmm1, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> <i32 0, i32 2, i32 1>
  store <3 x i32> %r, <3 x i32>* %p
  ret void
}

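; Mask <0,5,1,6,3> over two <4 x i16> inputs: four words are interleaved into
; one 8-byte store and the fifth word is stored via pextrw.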
define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
; SSE2-LABEL: v5i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    pextrw $6, %xmm0, %eax
; SSE2-NEXT:    movw %ax, 8(%rdi)
; SSE2-NEXT:    movq %xmm2, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v5i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE42-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; SSE42-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE42-NEXT:    pextrw $6, %xmm0, 8(%rdi)
; SSE42-NEXT:    movq %xmm2, (%rdi)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: v5i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpextrw $6, %xmm0, 8(%rdi)
; AVX1-NEXT:    vmovq %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: v5i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-SLOW-NEXT:    vpextrw $6, %xmm0, 8(%rdi)
; AVX2-SLOW-NEXT:    vmovq %xmm1, (%rdi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: v5i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FAST-NEXT:    vpextrw $6, %xmm0, 8(%rdi)
; AVX2-FAST-NEXT:    vmovq %xmm1, (%rdi)
; AVX2-FAST-NEXT:    retq
;
; XOP-LABEL: v5i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm0[0,1],xmm1[4,5],xmm0[4,5],xmm1[8,9],xmm0[4,5],xmm1[4,5],xmm0[6,7],xmm1[6,7]
; XOP-NEXT:    vpextrw $6, %xmm0, 8(%rdi)
; XOP-NEXT:    vmovq %xmm1, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <4 x i16> %a, <4 x i16> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
  store <5 x i16> %r, <5 x i16>* %p
  ret void
}

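; <5 x i32> variant of the same mask: blend the first four lanes into one
; 16-byte store and extract the fifth dword separately.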
define void @v5i32(<4 x i32> %a, <4 x i32> %b, <5 x i32>* %p) nounwind {
; SSE2-LABEL: v5i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movd %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v5i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE42-NEXT:    pextrd $3, %xmm0, 16(%rdi)
; SSE42-NEXT:    movdqa %xmm2, (%rdi)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: v5i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpextrd $3, %xmm0, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: v5i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpextrd $3, %xmm0, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; XOP-LABEL: v5i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7],xmm0[4,5,6,7],xmm1[8,9,10,11]
; XOP-NEXT:    vpextrd $3, %xmm0, 16(%rdi)
; XOP-NEXT:    vmovdqa %xmm1, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
  store <5 x i32> %r, <5 x i32>* %p
  ret void
}

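; Float variant: shufps builds the first four lanes; the fifth element goes
; out through extractps (SSE4.2+/AVX) or movss (SSE2).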
define void @v5f32(<4 x float> %a, <4 x float> %b, <5 x float>* %p) nounwind {
; SSE2-LABEL: v5f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    movss %xmm0, 16(%rdi)
; SSE2-NEXT:    movaps %xmm2, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v5f32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    extractps $3, %xmm0, 16(%rdi)
; SSE42-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,2]
; SSE42-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE42-NEXT:    movaps %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: v5f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX-NEXT:    vextractps $3, %xmm0, 16(%rdi)
; AVX-NEXT:    vmovaps %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; XOP-LABEL: v5f32:
; XOP:       # %bb.0:
; XOP-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
; XOP-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; XOP-NEXT:    vextractps $3, %xmm0, 16(%rdi)
; XOP-NEXT:    vmovaps %xmm1, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <4 x float> %a, <4 x float> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
  store <5 x float> %r, <5 x float>* %p
  ret void
}

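; Mask <0,6,3,6,1,7,4> over two <4 x i8> inputs; the 7-byte result is stored
; as a dword, a word, and a single byte.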
define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
; SSE2-LABEL: v7i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT:    movb %al, 6(%rdi)
; SSE2-NEXT:    movd %xmm0, (%rdi)
; SSE2-NEXT:    pextrw $2, %xmm0, %eax
; SSE2-NEXT:    movw %ax, 4(%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v7i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT:    pextrb $0, %xmm1, 6(%rdi)
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $2, %xmm1, 4(%rdi)
; SSE42-NEXT:    movd %xmm1, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: v7i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrb $0, %xmm1, 6(%rdi)
; AVX-NEXT:    vpextrw $2, %xmm0, 4(%rdi)
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; XOP-LABEL: v7i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,u,u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpextrb $0, %xmm1, 6(%rdi)
; XOP-NEXT:    vpextrw $2, %xmm0, 4(%rdi)
; XOP-NEXT:    vmovd %xmm0, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <4 x i8> %a, <4 x i8> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
  store <7 x i8> %r, <7 x i8>* %p
  ret void
}

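; Same mask widened to i16: the 14-byte result goes out as a qword, a dword,
; and a trailing word.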
define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
; SSE2-LABEL: v7i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    movw %ax, 12(%rdi)
; SSE2-NEXT:    movq %xmm2, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, 8(%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v7i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT:    pextrw $0, %xmm1, 12(%rdi)
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; SSE42-NEXT:    pextrd $2, %xmm1, 8(%rdi)
; SSE42-NEXT:    movq %xmm1, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: v7i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
; AVX-NEXT:    vpextrw $0, %xmm1, 12(%rdi)
; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT:    vmovq %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; XOP-LABEL: v7i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[8,9],xmm0[12,13],xmm1[8,9],xmm0[4,5],xmm1[12,13,0,1,14,15]
; XOP-NEXT:    vpextrw $0, %xmm1, 12(%rdi)
; XOP-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT:    vmovq %xmm0, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <4 x i16> %a, <4 x i16> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
  store <7 x i16> %r, <7 x i16>* %p
  ret void
}


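; Same mask widened to i32: 16-byte, 8-byte, and 4-byte stores. With AVX2 the
; whole shuffle can be done by a single cross-lane vpermps.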
define void @v7i32(<4 x i32> %a, <4 x i32> %b, <7 x i32>* %p) nounwind {
; SSE2-LABEL: v7i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,2,2]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movd %xmm1, 24(%rdi)
; SSE2-NEXT:    movq %xmm0, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm3, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v7i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE42-NEXT:    movd %xmm1, 24(%rdi)
; SSE42-NEXT:    movq %xmm0, 16(%rdi)
; SSE42-NEXT:    movdqa %xmm2, (%rdi)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: v7i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
; AVX1-NEXT:    vmovss %xmm1, 24(%rdi)
; AVX1-NEXT:    vmovlps %xmm0, 16(%rdi)
; AVX1-NEXT:    vmovaps %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: v7i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,6,3,6,1,7,4,u>
; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovss %xmm1, 24(%rdi)
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovlps %xmm1, 16(%rdi)
; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: v7i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; XOP-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
; XOP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; XOP-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
; XOP-NEXT:    vmovss %xmm1, 24(%rdi)
; XOP-NEXT:    vmovlps %xmm0, 16(%rdi)
; XOP-NEXT:    vmovaps %xmm2, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
  store <7 x i32> %r, <7 x i32>* %p
  ret void
}

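; 3-way interleave: groups of three taken from a[0..3], a[4..7] and b[0..3]
; form the <12 x i8> result.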
define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
; SSE2-LABEL: v12i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pandn %xmm1, %xmm2
; SSE2-NEXT:    por %xmm0, %xmm2
; SSE2-NEXT:    movq %xmm2, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, 8(%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v12i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
; SSE42-NEXT:    por %xmm1, %xmm0
; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rdi)
; SSE42-NEXT:    movq %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: v12i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT:    vmovq %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; XOP-LABEL: v12i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
; XOP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT:    vmovq %xmm0, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <8 x i8> %a, <8 x i8> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %r, <12 x i8>* %p
  ret void
}

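; Same interleave pattern at i16: a 16-byte store covers the first eight
; lanes and an 8-byte store the remaining four.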
define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; SSE2-LABEL: v12i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,0,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,4]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm2, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movq %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm3, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v12i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE42-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; SSE42-NEXT:    movdqa %xmm0, (%rdi)
; SSE42-NEXT:    movq %xmm3, 16(%rdi)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: v12i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: v12i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX2-SLOW-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-SLOW-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: v12i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpbroadcastd %xmm1, %xmm2
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
; AVX2-FAST-NEXT:    vmovq %xmm0, 16(%rdi)
; AVX2-FAST-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX2-FAST-NEXT:    retq
;
; XOP-LABEL: v12i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*#+}} xmm2 = xmm0[0,1,8,9],xmm1[0,1],xmm0[2,3,10,11],xmm1[2,3],xmm0[4,5,12,13]
; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[4,5],xmm0[6,7,14,15],xmm1[6,7],xmm0[8,9,10,11,12,13,14,15]
; XOP-NEXT:    vmovq %xmm0, 16(%rdi)
; XOP-NEXT:    vmovdqa %xmm2, (%rdi)
; XOP-NEXT:    retq
  %r = shufflevector <8 x i16> %a, <8 x i16> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i16> %r, <12 x i16>* %p
  ret void
}

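; 256-bit inputs: interleave the low and high halves of %a with the low half
; of %b into a 48-byte store.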
define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; SSE2-LABEL: v12i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
; SSE2-NEXT:    movaps %xmm2, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
; SSE2-NEXT:    movaps %xmm2, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,2]
; SSE2-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,3,2,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    movaps %xmm0, 32(%rdi)
; SSE2-NEXT:    movaps %xmm4, 16(%rdi)
; SSE2-NEXT:    movaps %xmm3, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: v12i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE42-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2]
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
; SSE42-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE42-NEXT:    movdqa %xmm4, 16(%rdi)
; SSE42-NEXT:    movdqa %xmm3, (%rdi)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: v12i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT:    vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6]
; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7]
; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = xmm1[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, 32(%rdi)
; AVX1-NEXT:    vmovaps %ymm2, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: v12i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
; AVX2-SLOW-NEXT:    vpermps %ymm0, %ymm3, %ymm0
; AVX2-SLOW-NEXT:    vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT:    vmovaps %ymm0, (%rdi)
; AVX2-SLOW-NEXT:    vmovaps %xmm2, 32(%rdi)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: v12i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm2
; AVX2-FAST-NEXT:    vbroadcastsd %xmm1, %ymm3
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm3 = [3,3,7,7,7,7,6,7]
; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm3, %ymm0
; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-FAST-NEXT:    vmovaps %xmm0, 32(%rdi)
; AVX2-FAST-NEXT:    vmovaps %ymm2, (%rdi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; XOP-LABEL: v12i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; XOP-NEXT:    vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6]
; XOP-NEXT:    vmovddup {{.*#+}} xmm3 = xmm1[0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
; XOP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
; XOP-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
; XOP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; XOP-NEXT:    vmovaps %xmm0, 32(%rdi)
; XOP-NEXT:    vmovaps %ymm2, (%rdi)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %r = shufflevector <8 x i32> %a, <8 x i32> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %r, <12 x i32>* %p
  ret void
}

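; PR29025: three <4 x i8> sources combined through two shuffles so the store
; is built from (a[i], b[i], c[i]) triples.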
define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounwind {
; SSE2-LABEL: pr29025:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm2
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3]
; SSE2-NEXT:    pandn %xmm2, %xmm1
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, 8(%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: pr29025:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE42-NEXT:    pshufb %xmm3, %xmm1
; SSE42-NEXT:    pshufb %xmm3, %xmm0
; SSE42-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE42-NEXT:    pshufb %xmm3, %xmm2
; SSE42-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rdi)
; SSE42-NEXT:    movq %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: pr29025:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT:    vmovq %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; XOP-LABEL: pr29025:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,4,8,12],xmm1[0,4,8,12],xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[4],xmm0[2,6],xmm2[8],xmm0[3,7],xmm2[12],xmm0[u,u,u,u]
; XOP-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT:    vmovq %xmm0, (%rdi)
; XOP-NEXT:    retq
  %s1 = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i8> %s1, <8 x i8> %s2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %r, <12 x i8>* %p, align 1
  ret void
}

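; De-interleave a 24-byte buffer into three <8 x i8> streams (strides of 3
; starting at offsets 0, 1 and 2).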
define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 x i8>* %q3) nounwind {
; SSE2-LABEL: interleave_24i8_out:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqu (%rdi), %xmm0
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pandn %xmm1, %xmm3
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pandn %xmm4, %xmm5
; SSE2-NEXT:    por %xmm3, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,2,1,3]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm3
; SSE2-NEXT:    movq %xmm3, (%rsi)
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm1, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pandn %xmm4, %xmm5
; SSE2-NEXT:    por %xmm3, %xmm5
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[2,1,0,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
; SSE2-NEXT:    packuswb %xmm0, %xmm3
; SSE2-NEXT:    movq %xmm3, (%rdx)
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    pandn %xmm1, %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm0
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: interleave_24i8_out:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqu (%rdi), %xmm0
; SSE42-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE42-NEXT:    movdqa %xmm1, %xmm2
; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[2,5,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movdqa %xmm0, %xmm3
; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; SSE42-NEXT:    por %xmm2, %xmm3
; SSE42-NEXT:    movq %xmm3, (%rsi)
; SSE42-NEXT:    movdqa %xmm1, %xmm2
; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movdqa %xmm0, %xmm3
; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
; SSE42-NEXT:    por %xmm2, %xmm3
; SSE42-NEXT:    movq %xmm3, (%rdx)
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSE42-NEXT:    por %xmm1, %xmm0
; SSE42-NEXT:    movq %xmm0, (%rcx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: interleave_24i8_out:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vmovq %xmm2, (%rsi)
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vmovq %xmm2, (%rdx)
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, (%rcx)
; AVX-NEXT:    retq
;
; XOP-LABEL: interleave_24i8_out:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqu (%rdi), %xmm0
; XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; XOP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpor %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vmovq %xmm2, (%rsi)
; XOP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpor %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vmovq %xmm2, (%rdx)
; XOP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, (%rcx)
; XOP-NEXT:    retq
  %wide.vec = load <24 x i8>, <24 x i8>* %p, align 4
  %s1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  store <8 x i8> %s1, <8 x i8>* %q1, align 4
  store <8 x i8> %s2, <8 x i8>* %q2, align 4
  store <8 x i8> %s3, <8 x i8>* %q3, align 4
  ret void
}

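; The inverse: interleave three <8 x i8> streams into one 24-byte buffer.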
define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 x i8>* %q3) nounwind {
; SSE2-LABEL: interleave_24i8_in:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,2,2]
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT:    pand %xmm5, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,1,3,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
; SSE2-NEXT:    pandn %xmm2, %xmm5
; SSE2-NEXT:    por %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,4,5]
; SSE2-NEXT:    packuswb %xmm5, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6]
; SSE2-NEXT:    pandn %xmm5, %xmm4
; SSE2-NEXT:    por %xmm2, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movq %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqu %xmm4, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: interleave_24i8_in:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE42-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE42-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE42-NEXT:    movdqa %xmm0, %xmm2
; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5]
; SSE42-NEXT:    movdqa %xmm1, %xmm3
; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero
; SSE42-NEXT:    por %xmm2, %xmm3
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    por %xmm0, %xmm1
; SSE42-NEXT:    movq %xmm1, 16(%rdi)
; SSE42-NEXT:    movdqu %xmm3, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: interleave_24i8_in:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, 16(%rdi)
; AVX-NEXT:    vmovdqu %xmm2, (%rdi)
; AVX-NEXT:    retq
;
; XOP-LABEL: interleave_24i8_in:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; XOP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; XOP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
; XOP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
; XOP-NEXT:    vpor %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, 16(%rdi)
; XOP-NEXT:    vmovdqu %xmm2, (%rdi)
; XOP-NEXT:    retq
  %s1 = load <8 x i8>, <8 x i8>* %q1, align 4
  %s2 = load <8 x i8>, <8 x i8>* %q2, align 4
  %s3 = load <8 x i8>, <8 x i8>* %q3, align 4
  %t1 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i8> %s3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i8> %interleaved, <24 x i8>* %p, align 4
  ret void
}


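; 16-bit version of the de-interleave: split 24 i16 elements into three
; stride-3 <8 x i16> streams.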
define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2, <8 x i16>* %q3) nounwind {
; SSE2-LABEL: interleave_24i16_out:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqu (%rdi), %xmm3
; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
; SSE2-NEXT:    movdqu 32(%rdi), %xmm8
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm1, %xmm4
; SSE2-NEXT:    pandn %xmm2, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm2, %xmm5
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    por %xmm5, %xmm6
; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm6[2,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,7]
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
; SSE2-NEXT:    pand %xmm6, %xmm5
; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm8[0,3,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
; SSE2-NEXT:    movdqa %xmm6, %xmm0
; SSE2-NEXT:    pandn %xmm7, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pandn %xmm3, %xmm4
; SSE2-NEXT:    por %xmm2, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
; SSE2-NEXT:    pand %xmm6, %xmm2
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,7,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,4,5]
; SSE2-NEXT:    pandn %xmm3, %xmm6
; SSE2-NEXT:    por %xmm2, %xmm6
; SSE2-NEXT:    movups %xmm1, (%rsi)
; SSE2-NEXT:    movdqu %xmm0, (%rdx)
; SSE2-NEXT:    movdqu %xmm6, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: interleave_24i16_out:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqu (%rdi), %xmm0
; SSE42-NEXT:    movdqu 16(%rdi), %xmm1
; SSE42-NEXT:    movdqu 32(%rdi), %xmm2
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
; SSE42-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
; SSE42-NEXT:    movdqa %xmm0, %xmm4
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7]
; SSE42-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7]
; SSE42-NEXT:    movdqa %xmm2, %xmm3
; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
; SSE42-NEXT:    movdqa %xmm0, %xmm5
; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
; SSE42-NEXT:    pshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,10,11,8,9,14,15]
; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
; SSE42-NEXT:    movdqu %xmm4, (%rsi)
; SSE42-NEXT:    movdqu %xmm5, (%rdx)
; SSE42-NEXT:    movdqu %xmm1, (%rcx)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: interleave_24i16_out:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqu (%rdi), %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,10,11,8,9,14,15]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT:    vmovdqu %xmm3, (%rsi)
; AVX1-NEXT:    vmovdqu %xmm4, (%rdx)
; AVX1-NEXT:    vmovdqu %xmm0, (%rcx)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: interleave_24i16_out:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
; AVX2-NEXT:    vmovdqu %xmm2, (%rsi)
; AVX2-NEXT:    vmovdqu %xmm3, (%rdx)
; AVX2-NEXT:    vmovdqu %xmm0, (%rcx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: interleave_24i16_out:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqu 32(%rdi), %xmm0
; XOP-NEXT:    vmovdqu (%rdi), %ymm1
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
; XOP-NEXT:    vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm0[4,5,10,11]
; XOP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
; XOP-NEXT:    vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm0[0,1,6,7,12,13]
; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[4,5,10,11],xmm2[0,1,6,7,12,13,14,15,0,1,2,3]
; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7,8,9],xmm0[2,3,8,9,14,15]
; XOP-NEXT:    vmovdqu %xmm3, (%rsi)
; XOP-NEXT:    vmovdqu %xmm4, (%rdx)
; XOP-NEXT:    vmovdqu %xmm0, (%rcx)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %wide.vec = load <24 x i16>, <24 x i16>* %p, align 4
  %s1 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  store <8 x i16> %s1, <8 x i16>* %q1, align 4
  store <8 x i16> %s2, <8 x i16>* %q2, align 4
  store <8 x i16> %s3, <8 x i16>* %q3, align 4
  ret void
}

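; Interleave three <8 x i16> vectors into one stride-3 <24 x i16> store; the
; inverse of interleave_24i16_out. Illustrative scalar form (C sketch, not
; part of the test):
;   for (i = 0; i < 8; ++i) { p[3*i] = q1[i]; p[3*i+1] = q2[i]; p[3*i+2] = q3[i]; }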
define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2, <8 x i16>* %q3) nounwind {
; SSE2-LABEL: interleave_24i16_in:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqu (%rsi), %xmm3
; SSE2-NEXT:    movdqu (%rdx), %xmm2
; SSE2-NEXT:    movdqu (%rcx), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,0,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pandn %xmm4, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,3,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5]
; SSE2-NEXT:    pand %xmm0, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pandn %xmm4, %xmm5
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,3,4,5,6,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,3,2,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    por %xmm5, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    pandn %xmm6, %xmm5
; SSE2-NEXT:    por %xmm1, %xmm5
; SSE2-NEXT:    pand %xmm0, %xmm5
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,6,7]
; SSE2-NEXT:    pandn %xmm1, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2-NEXT:    movdqu %xmm2, 32(%rdi)
; SSE2-NEXT:    movdqu %xmm3, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: interleave_24i16_in:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqu (%rsi), %xmm0
; SSE42-NEXT:    movdqu (%rdx), %xmm1
; SSE42-NEXT:    movdqu (%rcx), %xmm2
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
; SSE42-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,3,3,3]
; SSE42-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; SSE42-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,0,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2],xmm0[3,4],xmm5[5],xmm0[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7]
; SSE42-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,3,4,5,6,7]
; SSE42-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7]
; SSE42-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4,5,6,7,4,5,8,9,10,11,10,11,12,13,14,15]
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
; SSE42-NEXT:    movdqu %xmm4, 32(%rdi)
; SSE42-NEXT:    movdqu %xmm3, 16(%rdi)
; SSE42-NEXT:    movdqu %xmm5, (%rdi)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: interleave_24i16_in:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqu (%rsi), %xmm0
; AVX1-NEXT:    vmovdqu (%rdx), %xmm1
; AVX1-NEXT:    vmovdqu (%rcx), %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[0,1,3,3,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[0,0,0,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
; AVX1-NEXT:    vmovdqu %xmm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm3, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: interleave_24i16_in:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqu (%rsi), %xmm0
; AVX2-NEXT:    vmovdqu (%rdx), %xmm1
; AVX2-NEXT:    vmovdqu (%rcx), %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm3[0,1,2,3,6,7,2,3,8,9,8,9,4,5,6,7,16,17,18,19,22,23,18,19,24,25,24,25,20,21,22,23]
; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
; AVX2-NEXT:    vpermd %ymm2, %ymm4, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
; AVX2-NEXT:    vmovdqu %xmm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm3, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: interleave_24i16_in:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqu (%rsi), %xmm0
; XOP-NEXT:    vmovdqu (%rdx), %xmm1
; XOP-NEXT:    vmovdqu (%rcx), %xmm2
; XOP-NEXT:    vpperm {{.*#+}} xmm3 = xmm0[4,5,6,7],xmm1[6,7],xmm0[6,7,8,9],xmm1[8,9],xmm0[8,9,10,11]
; XOP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
; XOP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7]
; XOP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; XOP-NEXT:    vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11]
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[10,11],xmm0[12,13,12,13],xmm1[12,13,12,13],xmm0[14,15],xmm1[14,15],xmm0[14,15]
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
; XOP-NEXT:    vmovdqu %xmm0, 32(%rdi)
; XOP-NEXT:    vmovups %ymm3, (%rdi)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %s1 = load <8 x i16>, <8 x i16>* %q1, align 4
  %s2 = load <8 x i16>, <8 x i16>* %q2, align 4
  %s3 = load <8 x i16>, <8 x i16>* %q3, align 4
  %t1 = shufflevector <8 x i16> %s1, <8 x i16> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i16> %s3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i16> %interleaved, <24 x i16>* %p, align 4
  ret void
}

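; Same stride-3 de-interleave as interleave_24i16_out, but on <8 x i32>
; elements, so each output row fills a whole XMM pair / YMM register.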
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups 80(%rdi), %xmm9
; SSE2-NEXT:    movups 64(%rdi), %xmm10
; SSE2-NEXT:    movups (%rdi), %xmm0
; SSE2-NEXT:    movups 16(%rdi), %xmm11
; SSE2-NEXT:    movups 32(%rdi), %xmm8
; SSE2-NEXT:    movups 48(%rdi), %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
; SSE2-NEXT:    movaps %xmm9, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[1,1,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
; SSE2-NEXT:    movaps %xmm0, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm11[2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[1,1,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    movaps %xmm8, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[3,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,0]
; SSE2-NEXT:    movups %xmm3, 16(%rsi)
; SSE2-NEXT:    movups %xmm5, (%rsi)
; SSE2-NEXT:    movups %xmm2, 16(%rdx)
; SSE2-NEXT:    movups %xmm0, (%rdx)
; SSE2-NEXT:    movups %xmm7, 16(%rcx)
; SSE2-NEXT:    movups %xmm1, (%rcx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: interleave_24i32_out:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqu 80(%rdi), %xmm9
; SSE42-NEXT:    movdqu 64(%rdi), %xmm10
; SSE42-NEXT:    movdqu (%rdi), %xmm4
; SSE42-NEXT:    movdqu 16(%rdi), %xmm2
; SSE42-NEXT:    movdqu 32(%rdi), %xmm11
; SSE42-NEXT:    movdqu 48(%rdi), %xmm5
; SSE42-NEXT:    pshufd {{.*#+}} xmm8 = xmm11[0,1,0,1]
; SSE42-NEXT:    movdqa %xmm2, %xmm7
; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE42-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3]
; SSE42-NEXT:    blendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
; SSE42-NEXT:    movdqa %xmm10, %xmm1
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1]
; SSE42-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm10[2,3]
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm5 = xmm11[0,1,2,2]
; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,0,3,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm5[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3]
; SSE42-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[0,1,2,2]
; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,5],xmm5[6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm2[2,3],xmm6[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[0,1,0,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE42-NEXT:    movdqu %xmm3, 16(%rsi)
; SSE42-NEXT:    movups %xmm4, (%rsi)
; SSE42-NEXT:    movdqu %xmm5, 16(%rdx)
; SSE42-NEXT:    movdqu %xmm7, (%rdx)
; SSE42-NEXT:    movdqu %xmm2, 16(%rcx)
; SSE42-NEXT:    movdqu %xmm1, (%rcx)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: interleave_24i32_out:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    vmovups 64(%rdi), %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
; AVX1-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
; AVX1-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; AVX1-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX1-NEXT:    vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
; AVX1-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX1-NEXT:    vmovups %ymm4, (%rsi)
; AVX1-NEXT:    vmovups %ymm5, (%rdx)
; AVX1-NEXT:    vmovups %ymm0, (%rcx)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: interleave_24i32_out:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovups (%rdi), %ymm0
; AVX2-SLOW-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-SLOW-NEXT:    vmovups 64(%rdi), %ymm2
; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
; AVX2-SLOW-NEXT:    vpermps %ymm2, %ymm3, %ymm3
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT:    vpermps %ymm4, %ymm5, %ymm4
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
; AVX2-SLOW-NEXT:    vpermps %ymm2, %ymm4, %ymm4
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-SLOW-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT:    vmovups %ymm3, (%rsi)
; AVX2-SLOW-NEXT:    vmovups %ymm4, (%rdx)
; AVX2-SLOW-NEXT:    vmovups %ymm0, (%rcx)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: interleave_24i32_out:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovups (%rdi), %ymm0
; AVX2-FAST-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-FAST-NEXT:    vmovups 64(%rdi), %ymm2
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
; AVX2-FAST-NEXT:    vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-NEXT:    vpermps %ymm4, %ymm5, %ymm4
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
; AVX2-FAST-NEXT:    vpermps %ymm2, %ymm4, %ymm4
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-NEXT:    vpermps %ymm5, %ymm6, %ymm5
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm5 = [0,1,0,3,0,1,4,7]
; AVX2-FAST-NEXT:    vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-NEXT:    vmovups %ymm3, (%rsi)
; AVX2-FAST-NEXT:    vmovups %ymm4, (%rdx)
; AVX2-FAST-NEXT:    vmovups %ymm0, (%rcx)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; XOP-LABEL: interleave_24i32_out:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovups (%rdi), %ymm0
; XOP-NEXT:    vmovups 32(%rdi), %ymm1
; XOP-NEXT:    vmovups 64(%rdi), %ymm2
; XOP-NEXT:    vextractf128 $1, %ymm2, %xmm3
; XOP-NEXT:    vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1]
; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
; XOP-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; XOP-NEXT:    vextractf128 $1, %ymm5, %xmm6
; XOP-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3]
; XOP-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,1]
; XOP-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
; XOP-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; XOP-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
; XOP-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
; XOP-NEXT:    vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; XOP-NEXT:    vextractf128 $1, %ymm6, %xmm7
; XOP-NEXT:    vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
; XOP-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
; XOP-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
; XOP-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
; XOP-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3]
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; XOP-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; XOP-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; XOP-NEXT:    vmovups %ymm4, (%rsi)
; XOP-NEXT:    vmovups %ymm5, (%rdx)
; XOP-NEXT:    vmovups %ymm0, (%rcx)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %wide.vec = load <24 x i32>, <24 x i32>* %p, align 4
  %s1 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %s2 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %s3 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  store <8 x i32> %s1, <8 x i32>* %q1, align 4
  store <8 x i32> %s2, <8 x i32>* %q2, align 4
  store <8 x i32> %s3, <8 x i32>* %q3, align 4
  ret void
}

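; Same stride-3 interleave as interleave_24i16_in, but on <8 x i32> elements;
; the inverse of interleave_24i32_out.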
define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_in:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqu (%rsi), %xmm5
; SSE2-NEXT:    movdqu 16(%rsi), %xmm2
; SSE2-NEXT:    movdqu (%rdx), %xmm6
; SSE2-NEXT:    movdqu 16(%rdx), %xmm1
; SSE2-NEXT:    movups (%rcx), %xmm7
; SSE2-NEXT:    movups 16(%rcx), %xmm4
; SSE2-NEXT:    movdqa %xmm5, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; SSE2-NEXT:    movaps %xmm7, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2]
; SSE2-NEXT:    movaps %xmm7, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,2]
; SSE2-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,3,2,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,2],xmm5[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2]
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2]
; SSE2-NEXT:    movaps %xmm4, %xmm7
; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,0],xmm6[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm7[0,2]
; SSE2-NEXT:    movaps %xmm4, %xmm7
; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,2]
; SSE2-NEXT:    unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[0,2]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,3,2,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
; SSE2-NEXT:    movups %xmm2, 80(%rdi)
; SSE2-NEXT:    movups %xmm7, 64(%rdi)
; SSE2-NEXT:    movups %xmm6, 48(%rdi)
; SSE2-NEXT:    movups %xmm5, 32(%rdi)
; SSE2-NEXT:    movups %xmm3, 16(%rdi)
; SSE2-NEXT:    movups %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: interleave_24i32_in:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqu (%rsi), %xmm5
; SSE42-NEXT:    movdqu 16(%rsi), %xmm2
; SSE42-NEXT:    movdqu (%rdx), %xmm6
; SSE42-NEXT:    movdqu 16(%rdx), %xmm1
; SSE42-NEXT:    movdqu (%rcx), %xmm7
; SSE42-NEXT:    movdqu 16(%rcx), %xmm4
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1]
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5],xmm3[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5],xmm7[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1]
; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,1,0,1]
; SSE42-NEXT:    pblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,2,2]
; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE42-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7]
; SSE42-NEXT:    movdqu %xmm1, 80(%rdi)
; SSE42-NEXT:    movdqu %xmm7, 64(%rdi)
; SSE42-NEXT:    movdqu %xmm6, 48(%rdi)
; SSE42-NEXT:    movdqu %xmm5, 32(%rdi)
; SSE42-NEXT:    movdqu %xmm3, 16(%rdi)
; SSE42-NEXT:    movdqu %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: interleave_24i32_in:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rsi), %ymm0
; AVX1-NEXT:    vmovups (%rdx), %ymm1
; AVX1-NEXT:    vmovupd (%rcx), %ymm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0]
; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0]
; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = xmm2[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0]
; AVX1-NEXT:    vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2]
; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0]
; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2]
; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX1-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm4, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm3, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: interleave_24i32_in:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovups (%rsi), %ymm0
; AVX2-SLOW-NEXT:    vmovups (%rdx), %ymm1
; AVX2-SLOW-NEXT:    vmovups (%rcx), %ymm2
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-SLOW-NEXT:    vbroadcastsd %xmm2, %ymm4
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
; AVX2-SLOW-NEXT:    vbroadcastsd 24(%rsi), %ymm5
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-SLOW-NEXT:    vmovups %ymm4, 64(%rdi)
; AVX2-SLOW-NEXT:    vmovups %ymm3, (%rdi)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: interleave_24i32_in:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovups (%rsi), %ymm0
; AVX2-FAST-NEXT:    vmovups (%rdx), %ymm1
; AVX2-FAST-NEXT:    vmovups (%rcx), %ymm2
; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-FAST-NEXT:    vbroadcastsd %xmm2, %ymm4
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
; AVX2-FAST-NEXT:    vbroadcastsd 24(%rsi), %ymm2
; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-FAST-NEXT:    vmovups %ymm1, 64(%rdi)
; AVX2-FAST-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-FAST-NEXT:    vmovups %ymm3, (%rdi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; XOP-LABEL: interleave_24i32_in:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovups (%rsi), %ymm0
; XOP-NEXT:    vmovups (%rdx), %ymm1
; XOP-NEXT:    vmovupd (%rcx), %ymm2
; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0]
; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0]
; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; XOP-NEXT:    vmovddup {{.*#+}} xmm4 = xmm2[0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; XOP-NEXT:    vextractf128 $1, %ymm2, %xmm4
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm5
; XOP-NEXT:    vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0]
; XOP-NEXT:    vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2]
; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0]
; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2]
; XOP-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
; XOP-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3]
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
; XOP-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; XOP-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm2[2],ymm0[3],ymm2[2,3],ymm0[4],ymm2[5,4],ymm0[5]
; XOP-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; XOP-NEXT:    vmovups %ymm0, 32(%rdi)
; XOP-NEXT:    vmovups %ymm4, 64(%rdi)
; XOP-NEXT:    vmovups %ymm3, (%rdi)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %s1 = load <8 x i32>, <8 x i32>* %q1, align 4
  %s2 = load <8 x i32>, <8 x i32>* %q2, align 4
  %s3 = load <8 x i32>, <8 x i32>* %q3, align 4
  %t1 = shufflevector <8 x i32> %s1, <8 x i32> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %t2 = shufflevector <8 x i32> %s3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %interleaved = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
  store <24 x i32> %interleaved, <24 x i32>* %p, align 4
  ret void
}

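; Splat the low double of %A across an <8 x double> store, reload it, and
; shuffle lanes 2 and 0 out of the reload; since the stored vector is a splat,
; both lanes of the <2 x double> result equal element 0 of %A.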
define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
; SSE2-LABEL: wrongorder:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    movaps %xmm0, 48(%rdi)
; SSE2-NEXT:    movaps %xmm0, 32(%rdi)
; SSE2-NEXT:    movaps %xmm0, 16(%rdi)
; SSE2-NEXT:    movaps %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: wrongorder:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE42-NEXT:    movapd %xmm0, 48(%rdi)
; SSE42-NEXT:    movapd %xmm0, 32(%rdi)
; SSE42-NEXT:    movapd %xmm0, 16(%rdi)
; SSE42-NEXT:    movapd %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: wrongorder:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVX1-NEXT:    vmovaps %ymm1, (%rdi)
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: wrongorder:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm1
; AVX2-NEXT:    vmovapd %ymm1, 32(%rdi)
; AVX2-NEXT:    vmovapd %ymm1, (%rdi)
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: wrongorder:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; XOP-NEXT:    vmovaps %ymm1, 32(%rdi)
; XOP-NEXT:    vmovaps %ymm1, (%rdi)
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %shuffle = shufflevector <4 x double> %A, <4 x double> %A, <8 x i32> zeroinitializer
  store <8 x double> %shuffle, <8 x double>* %P, align 64
  %m2 = load <8 x double>, <8 x double>* %P, align 64
  store <8 x double> %m2, <8 x double>* %P, align 64
  %m3 = load <8 x double>, <8 x double>* %P, align 64
  %m4 = shufflevector <8 x double> %m3, <8 x double> undef, <2 x i32> <i32 2, i32 0>
  ret <2 x double> %m4
}