Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
      7 ;
      8 ; Just one 32-bit run to make sure we do reasonable things.
      9 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
     10 
     11 define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
     12 ; SSE-LABEL: merge_2f64_f64_23:
     13 ; SSE:       # BB#0:
     14 ; SSE-NEXT:    movups 16(%rdi), %xmm0
     15 ; SSE-NEXT:    retq
     16 ;
     17 ; AVX-LABEL: merge_2f64_f64_23:
     18 ; AVX:       # BB#0:
     19 ; AVX-NEXT:    vmovups 16(%rdi), %xmm0
     20 ; AVX-NEXT:    retq
     21 ;
     22 ; X32-SSE-LABEL: merge_2f64_f64_23:
     23 ; X32-SSE:       # BB#0:
     24 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
     25 ; X32-SSE-NEXT:    movups 16(%eax), %xmm0
     26 ; X32-SSE-NEXT:    retl
          ; Builds a <2 x double> from two scalar loads of %ptr[2] and %ptr[3].
          ; The checks above expect the pair to be emitted as one unaligned
          ; 16-byte load from byte offset 16 (movups / vmovups).
     27   %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
     28   %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
     29   %val0 = load double, double* %ptr0
     30   %val1 = load double, double* %ptr1
     31   %res0 = insertelement <2 x double> undef, double %val0, i32 0
     32   %res1 = insertelement <2 x double> %res0, double %val1, i32 1
     33   ret <2 x double> %res1
     34 }
     35 
     36 define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
     37 ; SSE-LABEL: merge_2i64_i64_12:
     38 ; SSE:       # BB#0:
     39 ; SSE-NEXT:    movups 8(%rdi), %xmm0
     40 ; SSE-NEXT:    retq
     41 ;
     42 ; AVX-LABEL: merge_2i64_i64_12:
     43 ; AVX:       # BB#0:
     44 ; AVX-NEXT:    vmovups 8(%rdi), %xmm0
     45 ; AVX-NEXT:    retq
     46 ;
     47 ; X32-SSE-LABEL: merge_2i64_i64_12:
     48 ; X32-SSE:       # BB#0:
     49 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
     50 ; X32-SSE-NEXT:    movups 8(%eax), %xmm0
     51 ; X32-SSE-NEXT:    retl
          ; Builds a <2 x i64> from two scalar loads of %ptr[1] and %ptr[2].
          ; The checks above expect one unaligned 16-byte load from byte offset 8,
          ; including on the 32-bit run where i64 is not a native scalar width.
     52   %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
     53   %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
     54   %val0 = load i64, i64* %ptr0
     55   %val1 = load i64, i64* %ptr1
     56   %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
     57   %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
     58   ret <2 x i64> %res1
     59 }
     60 
     61 define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline ssp {
     62 ; SSE-LABEL: merge_4f32_f32_2345:
     63 ; SSE:       # BB#0:
     64 ; SSE-NEXT:    movups 8(%rdi), %xmm0
     65 ; SSE-NEXT:    retq
     66 ;
     67 ; AVX-LABEL: merge_4f32_f32_2345:
     68 ; AVX:       # BB#0:
     69 ; AVX-NEXT:    vmovups 8(%rdi), %xmm0
     70 ; AVX-NEXT:    retq
     71 ;
     72 ; X32-SSE-LABEL: merge_4f32_f32_2345:
     73 ; X32-SSE:       # BB#0:
     74 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
     75 ; X32-SSE-NEXT:    movups 8(%eax), %xmm0
     76 ; X32-SSE-NEXT:    retl
          ; Fills all four lanes of a <4 x float> from %ptr[2] through %ptr[5].
          ; The checks above expect one unaligned 16-byte load from byte offset 8.
     77   %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
     78   %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
     79   %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
     80   %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
     81   %val0 = load float, float* %ptr0
     82   %val1 = load float, float* %ptr1
     83   %val2 = load float, float* %ptr2
     84   %val3 = load float, float* %ptr3
     85   %res0 = insertelement <4 x float> undef, float %val0, i32 0
     86   %res1 = insertelement <4 x float> %res0, float %val1, i32 1
     87   %res2 = insertelement <4 x float> %res1, float %val2, i32 2
     88   %res3 = insertelement <4 x float> %res2, float %val3, i32 3
     89   ret <4 x float> %res3
     90 }
     91 
     92 define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
     93 ; SSE-LABEL: merge_4f32_f32_3zuu:
     94 ; SSE:       # BB#0:
     95 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
     96 ; SSE-NEXT:    retq
     97 ;
     98 ; AVX-LABEL: merge_4f32_f32_3zuu:
     99 ; AVX:       # BB#0:
    100 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    101 ; AVX-NEXT:    retq
    102 ;
    103 ; X32-SSE-LABEL: merge_4f32_f32_3zuu:
    104 ; X32-SSE:       # BB#0:
    105 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    106 ; X32-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    107 ; X32-SSE-NEXT:    retl
          ; Lane 0 is loaded from %ptr[3], lane 1 is the constant 0.0, and lanes 2
          ; and 3 are left undef. The checks above expect a single scalar load
          ; that zeroes the remaining lanes (movss / vmovss), with no separate
          ; instruction to materialise the zero.
    108   %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
    109   %val0 = load float, float* %ptr0
    110   %res0 = insertelement <4 x float> undef, float %val0, i32 0
    111   %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
    112   ret <4 x float> %res1
    113 }
    114 
    115 define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
    116 ; SSE-LABEL: merge_4f32_f32_34uu:
    117 ; SSE:       # BB#0:
    118 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    119 ; SSE-NEXT:    retq
    120 ;
    121 ; AVX-LABEL: merge_4f32_f32_34uu:
    122 ; AVX:       # BB#0:
    123 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
    124 ; AVX-NEXT:    retq
    125 ;
    126 ; X32-SSE-LABEL: merge_4f32_f32_34uu:
    127 ; X32-SSE:       # BB#0:
    128 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    129 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    130 ; X32-SSE-NEXT:    retl
          ; Lanes 0 and 1 are loaded from %ptr[3] and %ptr[4]; lanes 2 and 3 are
          ; never written and stay undef. The checks above expect the two floats
          ; to be fetched with one 8-byte load (movsd / vmovsd).
    131   %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
    132   %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
    133   %val0 = load float, float* %ptr0
    134   %val1 = load float, float* %ptr1
    135   %res0 = insertelement <4 x float> undef, float %val0, i32 0
    136   %res1 = insertelement <4 x float> %res0, float %val1, i32 1
    137   ret <4 x float> %res1
    138 }
    139 
    140 define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
    141 ; SSE2-LABEL: merge_4f32_f32_34z6:
    142 ; SSE2:       # BB#0:
    143 ; SSE2-NEXT:    movups 12(%rdi), %xmm0
    144 ; SSE2-NEXT:    xorps %xmm1, %xmm1
    145 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
    146 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
    147 ; SSE2-NEXT:    retq
    148 ;
    149 ; SSE41-LABEL: merge_4f32_f32_34z6:
    150 ; SSE41:       # BB#0:
    151 ; SSE41-NEXT:    movups 12(%rdi), %xmm1
    152 ; SSE41-NEXT:    xorps %xmm0, %xmm0
    153 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
    154 ; SSE41-NEXT:    retq
    155 ;
    156 ; AVX-LABEL: merge_4f32_f32_34z6:
    157 ; AVX:       # BB#0:
    158 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
    159 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
    160 ; AVX-NEXT:    retq
    161 ;
    162 ; X32-SSE-LABEL: merge_4f32_f32_34z6:
    163 ; X32-SSE:       # BB#0:
    164 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    165 ; X32-SSE-NEXT:    movups 12(%eax), %xmm1
    166 ; X32-SSE-NEXT:    xorps %xmm0, %xmm0
    167 ; X32-SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
    168 ; X32-SSE-NEXT:    retl
          ; Starts from an all-zero vector and overwrites lanes 0, 1 and 3 with
          ; %ptr[3], %ptr[4] and %ptr[6]; lane 2 keeps its zero. The checks above
          ; expect one 16-byte load from byte offset 12 merged with a zeroed
          ; register, via two shufps on SSE2 and a blend on the other runs.
    169   %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
    170   %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
    171   %ptr3 = getelementptr inbounds float, float* %ptr, i64 6
    172   %val0 = load float, float* %ptr0
    173   %val1 = load float, float* %ptr1
    174   %val3 = load float, float* %ptr3
    175   %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
    176   %res1 = insertelement <4 x float> %res0, float %val1, i32 1
    177   %res3 = insertelement <4 x float> %res1, float %val3, i32 3
    178   ret <4 x float> %res3
    179 }
    180 
    181 define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
    182 ; SSE-LABEL: merge_4f32_f32_45zz:
    183 ; SSE:       # BB#0:
    184 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    185 ; SSE-NEXT:    retq
    186 ;
    187 ; AVX-LABEL: merge_4f32_f32_45zz:
    188 ; AVX:       # BB#0:
    189 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
    190 ; AVX-NEXT:    retq
    191 ;
    192 ; X32-SSE-LABEL: merge_4f32_f32_45zz:
    193 ; X32-SSE:       # BB#0:
    194 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    195 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    196 ; X32-SSE-NEXT:    retl
          ; Starts from an all-zero vector and overwrites lanes 0 and 1 with
          ; %ptr[4] and %ptr[5], so lanes 2 and 3 must be zero. The checks above
          ; expect one 8-byte load that zeroes the upper half (movsd / vmovsd).
    197   %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
    198   %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
    199   %val0 = load float, float* %ptr0
    200   %val1 = load float, float* %ptr1
    201   %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
    202   %res1 = insertelement <4 x float> %res0, float %val1, i32 1
    203   ret <4 x float> %res1
    204 }
    205 
    206 define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
    207 ; SSE2-LABEL: merge_4f32_f32_012u:
    208 ; SSE2:       # BB#0:
    209 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
    210 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    211 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
    212 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    213 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    214 ; SSE2-NEXT:    retq
    215 ;
    216 ; SSE41-LABEL: merge_4f32_f32_012u:
    217 ; SSE41:       # BB#0:
    218 ; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    219 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
    220 ; SSE41-NEXT:    retq
    221 ;
    222 ; AVX-LABEL: merge_4f32_f32_012u:
    223 ; AVX:       # BB#0:
    224 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
    225 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
    226 ; AVX-NEXT:    retq
    227 ;
    228 ; X32-SSE-LABEL: merge_4f32_f32_012u:
    229 ; X32-SSE:       # BB#0:
    230 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    231 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    232 ; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
    233 ; X32-SSE-NEXT:    retl
          ; Lanes 0 to 2 are loaded from %ptr[0], %ptr[1] and %ptr[2]; lane 3 is
          ; explicitly set to undef. The checks above do not expect a single
          ; 16-byte load here. SSE2 uses three scalar loads joined by unpcklps,
          ; the other runs use an 8-byte load followed by an insertps.
          ; NOTE(review) presumably a full-width load is avoided because %ptr[3]
          ; is never read by the IR - confirm against the combine's rules.
    234   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
    235   %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
    236   %ptr2 = getelementptr inbounds float, float* %ptr, i64 2
    237   %val0 = load float, float* %ptr0
    238   %val1 = load float, float* %ptr1
    239   %val2 = load float, float* %ptr2
    240   %res0 = insertelement <4 x float> undef, float %val0, i32 0
    241   %res1 = insertelement <4 x float> %res0, float %val1, i32 1
    242   %res2 = insertelement <4 x float> %res1, float %val2, i32 2
    243   %res3 = insertelement <4 x float> %res2, float undef, i32 3
    244   ret <4 x float> %res3
    245 }
    246 
    247 define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
    248 ; SSE2-LABEL: merge_4f32_f32_019u:
    249 ; SSE2:       # BB#0:
    250 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
    251 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    252 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
    253 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    254 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    255 ; SSE2-NEXT:    retq
    256 ;
    257 ; SSE41-LABEL: merge_4f32_f32_019u:
    258 ; SSE41:       # BB#0:
    259 ; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    260 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
    261 ; SSE41-NEXT:    retq
    262 ;
    263 ; AVX-LABEL: merge_4f32_f32_019u:
    264 ; AVX:       # BB#0:
    265 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
    266 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
    267 ; AVX-NEXT:    retq
    268 ;
    269 ; X32-SSE-LABEL: merge_4f32_f32_019u:
    270 ; X32-SSE:       # BB#0:
    271 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    272 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    273 ; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
    274 ; X32-SSE-NEXT:    retl
          ; Lanes 0 and 1 are loaded from the adjacent elements %ptr[0] and
          ; %ptr[1], but lane 2 comes from %ptr[9], which does not continue the
          ; sequence; lane 3 is explicitly undef. The checks above expect three
          ; scalar loads joined by unpcklps on SSE2, and an 8-byte load followed
          ; by a separate insertps from memory on the other runs.
    275   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
    276   %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
    277   %ptr2 = getelementptr inbounds float, float* %ptr, i64 9
    278   %val0 = load float, float* %ptr0
    279   %val1 = load float, float* %ptr1
    280   %val2 = load float, float* %ptr2
    281   %res0 = insertelement <4 x float> undef, float %val0, i32 0
    282   %res1 = insertelement <4 x float> %res0, float %val1, i32 1
    283   %res2 = insertelement <4 x float> %res1, float %val2, i32 2
    284   %res3 = insertelement <4 x float> %res2, float undef, i32 3
    285   ret <4 x float> %res3
    286 }
    287 
    288 define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
    289 ; SSE-LABEL: merge_4i32_i32_23u5:
    290 ; SSE:       # BB#0:
    291 ; SSE-NEXT:    movups 8(%rdi), %xmm0
    292 ; SSE-NEXT:    retq
    293 ;
    294 ; AVX-LABEL: merge_4i32_i32_23u5:
    295 ; AVX:       # BB#0:
    296 ; AVX-NEXT:    vmovups 8(%rdi), %xmm0
    297 ; AVX-NEXT:    retq
    298 ;
    299 ; X32-SSE-LABEL: merge_4i32_i32_23u5:
    300 ; X32-SSE:       # BB#0:
    301 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    302 ; X32-SSE-NEXT:    movups 8(%eax), %xmm0
    303 ; X32-SSE-NEXT:    retl
          ; Lanes 0, 1 and 3 are loaded from %ptr[2], %ptr[3] and %ptr[5]; lane 2
          ; is never written and stays undef. The checks above expect the undef
          ; gap to be bridged by one unaligned 16-byte load from byte offset 8.
    304   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
    305   %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
    306   %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
    307   %val0 = load i32, i32* %ptr0
    308   %val1 = load i32, i32* %ptr1
    309   %val3 = load i32, i32* %ptr3
    310   %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
    311   %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
    312   %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
    313   ret <4 x i32> %res3
    314 }
    315 
    316 define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
    317 ; SSE-LABEL: merge_4i32_i32_3zuu:
    318 ; SSE:       # BB#0:
    319 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    320 ; SSE-NEXT:    retq
    321 ;
    322 ; AVX-LABEL: merge_4i32_i32_3zuu:
    323 ; AVX:       # BB#0:
    324 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    325 ; AVX-NEXT:    retq
    326 ;
    327 ; X32-SSE-LABEL: merge_4i32_i32_3zuu:
    328 ; X32-SSE:       # BB#0:
    329 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    330 ; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    331 ; X32-SSE-NEXT:    retl
          ; Integer counterpart of the 3zuu float case. Lane 0 is loaded from
          ; %ptr[3], lane 1 is the constant 0, and lanes 2 and 3 stay undef.
          ; The checks above expect a single 4-byte load that zeroes the other
          ; lanes (movd / vmovd).
    332   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
    333   %val0 = load i32, i32* %ptr0
    334   %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
    335   %res1 = insertelement <4 x i32> %res0, i32     0, i32 1
    336   ret <4 x i32> %res1
    337 }
    338 
    339 define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
    340 ; SSE-LABEL: merge_4i32_i32_34uu:
    341 ; SSE:       # BB#0:
    342 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    343 ; SSE-NEXT:    retq
    344 ;
    345 ; AVX-LABEL: merge_4i32_i32_34uu:
    346 ; AVX:       # BB#0:
    347 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
    348 ; AVX-NEXT:    retq
    349 ;
    350 ; X32-SSE-LABEL: merge_4i32_i32_34uu:
    351 ; X32-SSE:       # BB#0:
    352 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    353 ; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    354 ; X32-SSE-NEXT:    retl
          ; Lanes 0 and 1 are loaded from %ptr[3] and %ptr[4]; lanes 2 and 3 are
          ; never written and stay undef. The checks above expect the pair to be
          ; fetched with one 8-byte load (movq / vmovq).
    355   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
    356   %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 4
    357   %val0 = load i32, i32* %ptr0
    358   %val1 = load i32, i32* %ptr1
    359   %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
    360   %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
    361   ret <4 x i32> %res1
    362 }
    363 
    364 define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
    365 ; SSE-LABEL: merge_4i32_i32_45zz:
    366 ; SSE:       # BB#0:
    367 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    368 ; SSE-NEXT:    retq
    369 ;
    370 ; AVX-LABEL: merge_4i32_i32_45zz:
    371 ; AVX:       # BB#0:
    372 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
    373 ; AVX-NEXT:    retq
    374 ;
    375 ; X32-SSE-LABEL: merge_4i32_i32_45zz:
    376 ; X32-SSE:       # BB#0:
    377 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    378 ; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    379 ; X32-SSE-NEXT:    retl
          ; Starts from an all-zero vector and overwrites lanes 0 and 1 with
          ; %ptr[4] and %ptr[5], so lanes 2 and 3 must be zero. The checks above
          ; expect one 8-byte load that zeroes the upper half (movq / vmovq).
    380   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
    381   %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
    382   %val0 = load i32, i32* %ptr0
    383   %val1 = load i32, i32* %ptr1
    384   %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
    385   %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
    386   ret <4 x i32> %res1
    387 }
    388 
    389 define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
    390 ; SSE-LABEL: merge_8i16_i16_23u567u9:
    391 ; SSE:       # BB#0:
    392 ; SSE-NEXT:    movups 4(%rdi), %xmm0
    393 ; SSE-NEXT:    retq
    394 ;
    395 ; AVX-LABEL: merge_8i16_i16_23u567u9:
    396 ; AVX:       # BB#0:
    397 ; AVX-NEXT:    vmovups 4(%rdi), %xmm0
    398 ; AVX-NEXT:    retq
    399 ;
    400 ; X32-SSE-LABEL: merge_8i16_i16_23u567u9:
    401 ; X32-SSE:       # BB#0:
    402 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    403 ; X32-SSE-NEXT:    movups 4(%eax), %xmm0
    404 ; X32-SSE-NEXT:    retl
          ; Lanes 0, 1, 3, 4, 5 and 7 are loaded from %ptr[2], %ptr[3], %ptr[5],
          ; %ptr[6], %ptr[7] and %ptr[9]; lanes 2 and 6 are never written and
          ; stay undef. The checks above expect both undef gaps to be bridged by
          ; one unaligned 16-byte load from byte offset 4.
    405   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
    406   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
    407   %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 5
    408   %ptr4 = getelementptr inbounds i16, i16* %ptr, i64 6
    409   %ptr5 = getelementptr inbounds i16, i16* %ptr, i64 7
    410   %ptr7 = getelementptr inbounds i16, i16* %ptr, i64 9
    411   %val0 = load i16, i16* %ptr0
    412   %val1 = load i16, i16* %ptr1
    413   %val3 = load i16, i16* %ptr3
    414   %val4 = load i16, i16* %ptr4
    415   %val5 = load i16, i16* %ptr5
    416   %val7 = load i16, i16* %ptr7
    417   %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
    418   %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
    419   %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
    420   %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
    421   %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
    422   %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
    423   ret <8 x i16> %res7
    424 }
    425 
    426 define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
    427 ; SSE-LABEL: merge_8i16_i16_34uuuuuu:
    428 ; SSE:       # BB#0:
    429 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    430 ; SSE-NEXT:    retq
    431 ;
    432 ; AVX-LABEL: merge_8i16_i16_34uuuuuu:
    433 ; AVX:       # BB#0:
    434 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    435 ; AVX-NEXT:    retq
    436 ;
    437 ; X32-SSE-LABEL: merge_8i16_i16_34uuuuuu:
    438 ; X32-SSE:       # BB#0:
    439 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    440 ; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    441 ; X32-SSE-NEXT:    retl
          ; Lanes 0 and 1 are loaded from %ptr[3] and %ptr[4]; the other six
          ; lanes are never written and stay undef. The checks above expect the
          ; two 16-bit values to be fetched with one 4-byte load (movd / vmovd).
    442   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
    443   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
    444   %val0 = load i16, i16* %ptr0
    445   %val1 = load i16, i16* %ptr1
    446   %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
    447   %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
    448   ret <8 x i16> %res1
    449 }
    450 
    451 define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
    452 ; SSE-LABEL: merge_8i16_i16_45u7zzzz:
    453 ; SSE:       # BB#0:
    454 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    455 ; SSE-NEXT:    retq
    456 ;
    457 ; AVX-LABEL: merge_8i16_i16_45u7zzzz:
    458 ; AVX:       # BB#0:
    459 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
    460 ; AVX-NEXT:    retq
    461 ;
    462 ; X32-SSE-LABEL: merge_8i16_i16_45u7zzzz:
    463 ; X32-SSE:       # BB#0:
    464 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    465 ; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    466 ; X32-SSE-NEXT:    retl
          ; Lanes 0, 1 and 3 are loaded from %ptr[4], %ptr[5] and %ptr[7]; lane 2
          ; is never written and stays undef; lanes 4 to 7 are set to constant 0.
          ; The checks above expect one 8-byte load that zeroes the upper half
          ; (movq / vmovq), covering both the undef gap and the explicit zeros.
    467   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
    468   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
    469   %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
    470   %val0 = load i16, i16* %ptr0
    471   %val1 = load i16, i16* %ptr1
    472   %val3 = load i16, i16* %ptr3
    473   %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
    474   %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
    475   %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
    476   %res4 = insertelement <8 x i16> %res3, i16     0, i32 4
    477   %res5 = insertelement <8 x i16> %res4, i16     0, i32 5
    478   %res6 = insertelement <8 x i16> %res5, i16     0, i32 6
    479   %res7 = insertelement <8 x i16> %res6, i16     0, i32 7
    480   ret <8 x i16> %res7
    481 }
    482 
    483 define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noinline ssp {
    484 ; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
    485 ; SSE:       # BB#0:
    486 ; SSE-NEXT:    movups (%rdi), %xmm0
    487 ; SSE-NEXT:    retq
    488 ;
    489 ; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
    490 ; AVX:       # BB#0:
    491 ; AVX-NEXT:    vmovups (%rdi), %xmm0
    492 ; AVX-NEXT:    retq
    493 ;
    494 ; X32-SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
    495 ; X32-SSE:       # BB#0:
    496 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    497 ; X32-SSE-NEXT:    movups (%eax), %xmm0
    498 ; X32-SSE-NEXT:    retl
          ; Each lane N is loaded from byte %ptr[N], except lanes 2 and 14 which
          ; are never written and stay undef. The checks above expect the
          ; fourteen byte loads to be emitted as one unaligned 16-byte load from
          ; byte offset 0.
    499   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
    500   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
    501   %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
    502   %ptr4 = getelementptr inbounds i8, i8* %ptr, i64 4
    503   %ptr5 = getelementptr inbounds i8, i8* %ptr, i64 5
    504   %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
    505   %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
    506   %ptr8 = getelementptr inbounds i8, i8* %ptr, i64 8
    507   %ptr9 = getelementptr inbounds i8, i8* %ptr, i64 9
    508   %ptrA = getelementptr inbounds i8, i8* %ptr, i64 10
    509   %ptrB = getelementptr inbounds i8, i8* %ptr, i64 11
    510   %ptrC = getelementptr inbounds i8, i8* %ptr, i64 12
    511   %ptrD = getelementptr inbounds i8, i8* %ptr, i64 13
    512   %ptrF = getelementptr inbounds i8, i8* %ptr, i64 15
    513   %val0 = load i8, i8* %ptr0
    514   %val1 = load i8, i8* %ptr1
    515   %val3 = load i8, i8* %ptr3
    516   %val4 = load i8, i8* %ptr4
    517   %val5 = load i8, i8* %ptr5
    518   %val6 = load i8, i8* %ptr6
    519   %val7 = load i8, i8* %ptr7
    520   %val8 = load i8, i8* %ptr8
    521   %val9 = load i8, i8* %ptr9
    522   %valA = load i8, i8* %ptrA
    523   %valB = load i8, i8* %ptrB
    524   %valC = load i8, i8* %ptrC
    525   %valD = load i8, i8* %ptrD
    526   %valF = load i8, i8* %ptrF
    527   %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
    528   %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
    529   %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
    530   %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
    531   %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
    532   %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
    533   %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
    534   %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
    535   %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
    536   %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
    537   %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
    538   %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
    539   %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
    540   %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
    541   ret <16 x i8> %resF
    542 }
    543 
    544 define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
    545 ; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
    546 ; SSE:       # BB#0:
    547 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    548 ; SSE-NEXT:    retq
    549 ;
    550 ; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
    551 ; AVX:       # BB#0:
    552 ; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    553 ; AVX-NEXT:    retq
    554 ;
    555 ; X32-SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
    556 ; X32-SSE:       # BB#0:
    557 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    558 ; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    559 ; X32-SSE-NEXT:    retl
          ; Lanes 0, 1 and 3 are loaded from %ptr[0], %ptr[1] and %ptr[3]; lanes
          ; 6, 7, 13, 14 and 15 are set to constant 0; every other lane is never
          ; written and stays undef. The checks above expect one 4-byte load
          ; that zeroes the rest of the register (movd / vmovd).
    560   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
    561   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
    562   %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
    563   %val0 = load i8, i8* %ptr0
    564   %val1 = load i8, i8* %ptr1
    565   %val3 = load i8, i8* %ptr3
    566   %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
    567   %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
    568   %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
    569   %res6 = insertelement <16 x i8> %res3, i8     0, i32 6
    570   %res7 = insertelement <16 x i8> %res6, i8     0, i32 7
    571   %resD = insertelement <16 x i8> %res7, i8     0, i32 13
    572   %resE = insertelement <16 x i8> %resD, i8     0, i32 14
    573   %resF = insertelement <16 x i8> %resE, i8     0, i32 15
    574   ret <16 x i8> %resF
    575 }
    576 
    577 define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
    578 ; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
    579 ; SSE:       # BB#0:
    580 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    581 ; SSE-NEXT:    retq
    582 ;
    583 ; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
    584 ; AVX:       # BB#0:
    585 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
    586 ; AVX-NEXT:    retq
    587 ;
    588 ; X32-SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
    589 ; X32-SSE:       # BB#0:
    590 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    591 ; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    592 ; X32-SSE-NEXT:    retl
          ; Lanes 0 to 3 and lanes 6 and 7 are loaded from the matching byte
          ; indices of %ptr; lanes 13, 14 and 15 are set to constant 0; every
          ; other lane is never written and stays undef. The checks above expect
          ; one 8-byte load that zeroes the upper half (movq / vmovq).
    593   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
    594   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
    595   %ptr2 = getelementptr inbounds i8, i8* %ptr, i64 2
    596   %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
    597   %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
    598   %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
    599   %val0 = load i8, i8* %ptr0
    600   %val1 = load i8, i8* %ptr1
    601   %val2 = load i8, i8* %ptr2
    602   %val3 = load i8, i8* %ptr3
    603   %val6 = load i8, i8* %ptr6
    604   %val7 = load i8, i8* %ptr7
    605   %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
    606   %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
    607   %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2
    608   %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3
    609   %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6
    610   %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
    611   %resD = insertelement <16 x i8> %res7, i8     0, i32 13
    612   %resE = insertelement <16 x i8> %resD, i8     0, i32 14
    613   %resF = insertelement <16 x i8> %resE, i8     0, i32 15
    614   ret <16 x i8> %resF
    615 }
    616 
    617 define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) {
    618 ; SSE-LABEL: merge_4i32_i32_combine:
    619 ; SSE:       # BB#0:
    620 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    621 ; SSE-NEXT:    movaps %xmm0, (%rdi)
    622 ; SSE-NEXT:    retq
    623 ;
    624 ; AVX1-LABEL: merge_4i32_i32_combine:
    625 ; AVX1:       # BB#0:
    626 ; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    627 ; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
    628 ; AVX1-NEXT:    retq
    629 ;
    630 ; AVX2-LABEL: merge_4i32_i32_combine:
    631 ; AVX2:       # BB#0:
    632 ; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    633 ; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
    634 ; AVX2-NEXT:    retq
    635 ;
    636 ; AVX512F-LABEL: merge_4i32_i32_combine:
    637 ; AVX512F:       # BB#0:
    638 ; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    639 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
    640 ; AVX512F-NEXT:    retq
    641 ;
    642 ; X32-SSE-LABEL: merge_4i32_i32_combine:
    643 ; X32-SSE:       # BB#0:
    644 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    645 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    646 ; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    647 ; X32-SSE-NEXT:    movaps %xmm0, (%eax)
    648 ; X32-SSE-NEXT:    retl
         ; Loads the single i32 at %src[0], splats it to all four lanes (the
         ; shufflevector mask is all zeros), shifts right by a vector whose lane
         ; 0 is 0 and whose other lanes are undef, then masks with
         ; <-1, 0, 0, 0> so only lane 0 can be non-zero, and stores to %dst.
         ; The checks above expect the whole chain to fold into one 4-byte load
         ; that zeroes the other lanes, followed by an aligned 16-byte store.
    649  %1 = getelementptr i32, i32* %src, i32 0
    650  %2 = load i32, i32* %1
    651  %3 = insertelement <4 x i32> undef, i32 %2, i32 0
    652  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
    653  %5 = lshr <4 x i32> %4, <i32 0, i32 undef, i32 undef, i32 undef>
    654  %6 = and <4 x i32> %5, <i32 -1, i32 0, i32 0, i32 0>
    655  store <4 x i32> %6, <4 x i32>* %dst
    656  ret void
    657 }
    658 
    659 ;
    660 ; Consecutive loads must not be combined if any (or all) of them are volatile.
    661 ;
    662 
    663 define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinline ssp {
    664 ; SSE-LABEL: merge_2i64_i64_12_volatile:
    665 ; SSE:       # BB#0:
    666 ; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    667 ; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
    668 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    669 ; SSE-NEXT:    retq
    670 ;
    671 ; AVX-LABEL: merge_2i64_i64_12_volatile:
    672 ; AVX:       # BB#0:
    673 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
    674 ; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
    675 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    676 ; AVX-NEXT:    retq
    677 ;
    678 ; X32-SSE-LABEL: merge_2i64_i64_12_volatile:
    679 ; X32-SSE:       # BB#0:
    680 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    681 ; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    682 ; X32-SSE-NEXT:    pinsrd $1, 12(%eax), %xmm0
    683 ; X32-SSE-NEXT:    pinsrd $2, 16(%eax), %xmm0
    684 ; X32-SSE-NEXT:    pinsrd $3, 20(%eax), %xmm0
    685 ; X32-SSE-NEXT:    retl
          ; Same access pattern as merge_2i64_i64_12 (%ptr[1] and %ptr[2]), but
          ; both loads are volatile. The checks above expect the loads to stay
          ; separate. The 64-bit runs use two 8-byte loads joined by an unpack;
          ; the 32-bit run uses a 4-byte load followed by three pinsrd from
          ; memory.
    686   %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
    687   %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
    688   %val0 = load volatile i64, i64* %ptr0
    689   %val1 = load volatile i64, i64* %ptr1
    690   %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
    691   %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
    692   ret <2 x i64> %res1
    693 }
    694 
    695 define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
    696 ; SSE2-LABEL: merge_4f32_f32_2345_volatile:
    697 ; SSE2:       # BB#0:
    698 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    699 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
    700 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
    701 ; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
    702 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
    703 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
    704 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    705 ; SSE2-NEXT:    retq
    706 ;
    707 ; SSE41-LABEL: merge_4f32_f32_2345_volatile:
    708 ; SSE41:       # BB#0:
    709 ; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    710 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
    711 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
    712 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
    713 ; SSE41-NEXT:    retq
    714 ;
    715 ; AVX-LABEL: merge_4f32_f32_2345_volatile:
    716 ; AVX:       # BB#0:
    717 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    718 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
    719 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
    720 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
    721 ; AVX-NEXT:    retq
    722 ;
    723 ; X32-SSE-LABEL: merge_4f32_f32_2345_volatile:
    724 ; X32-SSE:       # BB#0:
    725 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    726 ; X32-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    727 ; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
    728 ; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
    729 ; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
    730 ; X32-SSE-NEXT:    retl
          ; Builds a <4 x float> from the adjacent elements %ptr[2] .. %ptr[5].
          ; Only the first load (%val0) is volatile; the other three are ordinary
          ; loads. The expected output above still performs four separate scalar
          ; memory accesses (four scalar loads plus unpacks on the sse2-only run;
          ; one scalar load plus three insertps-from-memory on the other runs)
          ; and never a single vector-wide load.
    731   %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
    732   %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
    733   %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
    734   %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
          ; The single volatile load in this test; the rest are non-volatile.
    735   %val0 = load volatile float, float* %ptr0
    736   %val1 = load float, float* %ptr1
    737   %val2 = load float, float* %ptr2
    738   %val3 = load float, float* %ptr3
          ; Lanes 0..3 <- %ptr[2], %ptr[3], %ptr[4], %ptr[5] in order.
    739   %res0 = insertelement <4 x float> undef, float %val0, i32 0
    740   %res1 = insertelement <4 x float> %res0, float %val1, i32 1
    741   %res2 = insertelement <4 x float> %res1, float %val2, i32 2
    742   %res3 = insertelement <4 x float> %res2, float %val3, i32 3
    743   ret <4 x float> %res3
    744 }
    745 
    746 ;
    747 ; Non-consecutive test.
    748 ;
    749 
    750 define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
    751 ; SSE-LABEL: merge_4f32_f32_X0YY:
    752 ; SSE:       # BB#0:
    753 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    754 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
    755 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
    756 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    757 ; SSE-NEXT:    retq
    758 ;
    759 ; AVX-LABEL: merge_4f32_f32_X0YY:
    760 ; AVX:       # BB#0:
    761 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    762 ; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
    763 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
    764 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    765 ; AVX-NEXT:    retq
    766 ;
    767 ; X32-SSE-LABEL: merge_4f32_f32_X0YY:
    768 ; X32-SSE:       # BB#0:
    769 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    770 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    771 ; X32-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    772 ; X32-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
    773 ; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
    774 ; X32-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    775 ; X32-SSE-NEXT:    retl
          ; Builds a <4 x float> whose lane 0 is *%ptr0, lane 1 is the constant
          ; 0.0, and lanes 2 and 3 are both *%ptr1. The two sources are separate
          ; pointer arguments rather than offsets from one base.
          ; Expected output above is two scalar loads, a shuffle that duplicates
          ; the second loaded value into its register's two low lanes, and an
          ; unpack that joins the low 64-bit halves of both registers. The first
          ; scalar load already leaves zero in lane 1 (mem[0],zero,zero,zero).
    776   %val0 = load float, float* %ptr0, align 4
    777   %val1 = load float, float* %ptr1, align 4
    778   %res0 = insertelement <4 x float> undef, float %val0, i32 0
    779   %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1
          ; %val1 is used twice, filling both upper lanes from a single load.
    780   %res2 = insertelement <4 x float> %res1, float %val1, i32 2
    781   %res3 = insertelement <4 x float> %res2, float %val1, i32 3
    782   ret <4 x float> %res3
    783 }
    784