Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
      3 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
      4 
      5 %struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
      6 %struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
      7 
      8 ; Store the byte constants 1..8 as one merged 64-bit integer.
      9 define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
     10 ; CHECK-LABEL: merge_const_store:
     11 ; CHECK:       # %bb.0:
     12 ; CHECK-NEXT:    testl %edi, %edi
     13 ; CHECK-NEXT:    jle .LBB0_3
     14 ; CHECK-NEXT:  # %bb.1: # %.lr.ph.preheader
     15 ; CHECK-NEXT:    movabsq $578437695752307201, %rax # imm = 0x807060504030201
     16 ; CHECK-NEXT:    .p2align 4, 0x90
     17 ; CHECK-NEXT:  .LBB0_2: # %.lr.ph
     18 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
     19 ; CHECK-NEXT:    movq %rax, (%rsi)
     20 ; CHECK-NEXT:    addq $8, %rsi
     21 ; CHECK-NEXT:    decl %edi
     22 ; CHECK-NEXT:    jne .LBB0_2
     23 ; CHECK-NEXT:  .LBB0_3: # %._crit_edge
     24 ; CHECK-NEXT:    retq
     25   %1 = icmp sgt i32 %count, 0 ; the loop runs only when %count > 0
     26   br i1 %1, label %.lr.ph, label %._crit_edge
     27 .lr.ph: ; loop body: fill one %struct.A per iteration
     28   %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] ; iteration counter, starts at 0
     29   %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ] ; current struct, starts at %p
     30   %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
     31   store i8 1, i8* %2, align 1 ; fields 0..7 receive the constants 1..8; the assertions above expect one 8-byte store
     32   %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
     33   store i8 2, i8* %3, align 1
     34   %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
     35   store i8 3, i8* %4, align 1
     36   %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
     37   store i8 4, i8* %5, align 1
     38   %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
     39   store i8 5, i8* %6, align 1
     40   %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
     41   store i8 6, i8* %7, align 1
     42   %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
     43   store i8 7, i8* %8, align 1
     44   %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
     45   store i8 8, i8* %9, align 1
     46   %10 = add nsw i32 %i.02, 1
     47   %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1 ; step to the next %struct.A (eight i8 fields)
     48   %exitcond = icmp eq i32 %10, %count ; leave once the counter reaches %count
     49   br i1 %exitcond, label %._crit_edge, label %.lr.ph
     50 ._crit_edge:
     51   ret void
     52 }
     53 
     54 ; No vectors because we use noimplicitfloat
     55 define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{
     56 ; CHECK-LABEL: merge_const_store_no_vec:
     57 ; CHECK:       # %bb.0:
     58 ; CHECK-NEXT:    testl %edi, %edi
     59 ; CHECK-NEXT:    jle .LBB1_2
     60 ; CHECK-NEXT:    .p2align 4, 0x90
     61 ; CHECK-NEXT:  .LBB1_1: # %.lr.ph
     62 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
     63 ; CHECK-NEXT:    movq $0, (%rsi)
     64 ; CHECK-NEXT:    movq $0, 8(%rsi)
     65 ; CHECK-NEXT:    movq $0, 16(%rsi)
     66 ; CHECK-NEXT:    movq $0, 24(%rsi)
     67 ; CHECK-NEXT:    addq $32, %rsi
     68 ; CHECK-NEXT:    decl %edi
     69 ; CHECK-NEXT:    jne .LBB1_1
     70 ; CHECK-NEXT:  .LBB1_2: # %._crit_edge
     71 ; CHECK-NEXT:    retq
     72   %1 = icmp sgt i32 %count, 0 ; the loop runs only when %count > 0
     73   br i1 %1, label %.lr.ph, label %._crit_edge
     74 .lr.ph: ; loop body: zero one %struct.B per iteration
     75   %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] ; iteration counter, starts at 0
     76   %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ] ; current struct, starts at %p
     77   %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
     78   store i32 0, i32* %2, align 4 ; eight i32 zero stores; with noimplicitfloat the assertions expect four 8-byte integer stores
     79   %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
     80   store i32 0, i32* %3, align 4
     81   %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
     82   store i32 0, i32* %4, align 4
     83   %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
     84   store i32 0, i32* %5, align 4
     85   %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
     86   store i32 0, i32* %6, align 4
     87   %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
     88   store i32 0, i32* %7, align 4
     89   %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
     90   store i32 0, i32* %8, align 4
     91   %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
     92   store i32 0, i32* %9, align 4
     93   %10 = add nsw i32 %i.02, 1
     94   %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; step to the next %struct.B (eight i32 fields)
     95   %exitcond = icmp eq i32 %10, %count ; leave once the counter reaches %count
     96   br i1 %exitcond, label %._crit_edge, label %.lr.ph
     97 ._crit_edge:
     98   ret void
     99 }
    100 
    101 ; Move the constants using a single vector store.
    102 define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
    103 ; CHECK-LABEL: merge_const_store_vec:
    104 ; CHECK:       # %bb.0:
    105 ; CHECK-NEXT:    testl %edi, %edi
    106 ; CHECK-NEXT:    jle .LBB2_3
    107 ; CHECK-NEXT:  # %bb.1: # %.lr.ph.preheader
    108 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
    109 ; CHECK-NEXT:    .p2align 4, 0x90
    110 ; CHECK-NEXT:  .LBB2_2: # %.lr.ph
    111 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
    112 ; CHECK-NEXT:    vmovups %ymm0, (%rsi)
    113 ; CHECK-NEXT:    addq $32, %rsi
    114 ; CHECK-NEXT:    decl %edi
    115 ; CHECK-NEXT:    jne .LBB2_2
    116 ; CHECK-NEXT:  .LBB2_3: # %._crit_edge
    117 ; CHECK-NEXT:    vzeroupper
    118 ; CHECK-NEXT:    retq
    119   %1 = icmp sgt i32 %count, 0 ; the loop runs only when %count > 0
    120   br i1 %1, label %.lr.ph, label %._crit_edge
    121 .lr.ph: ; loop body: zero one %struct.B per iteration
    122   %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] ; iteration counter, starts at 0
    123   %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ] ; current struct, starts at %p
    124   %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
    125   store i32 0, i32* %2, align 4 ; eight i32 zero stores; the assertions above expect one 32-byte ymm store
    126   %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
    127   store i32 0, i32* %3, align 4
    128   %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
    129   store i32 0, i32* %4, align 4
    130   %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
    131   store i32 0, i32* %5, align 4
    132   %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
    133   store i32 0, i32* %6, align 4
    134   %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
    135   store i32 0, i32* %7, align 4
    136   %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
    137   store i32 0, i32* %8, align 4
    138   %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
    139   store i32 0, i32* %9, align 4
    140   %10 = add nsw i32 %i.02, 1
    141   %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; step to the next %struct.B (eight i32 fields)
    142   %exitcond = icmp eq i32 %10, %count ; leave once the counter reaches %count
    143   br i1 %exitcond, label %._crit_edge, label %.lr.ph
    144 ._crit_edge:
    145   ret void
    146 }
    147 
    148 ; Merge the first 4 constants into a single 32-bit integer store; the non-constant byte and the remaining constants use narrower stores.
    149 define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
    150 ; CHECK-LABEL: merge_nonconst_store:
    151 ; CHECK:       # %bb.0:
    152 ; CHECK-NEXT:    testl %edi, %edi
    153 ; CHECK-NEXT:    jle .LBB3_2
    154 ; CHECK-NEXT:    .p2align 4, 0x90
    155 ; CHECK-NEXT:  .LBB3_1: # %.lr.ph
    156 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
    157 ; CHECK-NEXT:    movl $67305985, (%rdx) # imm = 0x4030201
    158 ; CHECK-NEXT:    movb %sil, 4(%rdx)
    159 ; CHECK-NEXT:    movw $1798, 5(%rdx) # imm = 0x706
    160 ; CHECK-NEXT:    movb $8, 7(%rdx)
    161 ; CHECK-NEXT:    addq $8, %rdx
    162 ; CHECK-NEXT:    decl %edi
    163 ; CHECK-NEXT:    jne .LBB3_1
    164 ; CHECK-NEXT:  .LBB3_2: # %._crit_edge
    165 ; CHECK-NEXT:    retq
    166   %1 = icmp sgt i32 %count, 0 ; the loop runs only when %count > 0
    167   br i1 %1, label %.lr.ph, label %._crit_edge
    168 .lr.ph: ; loop body: fill one %struct.A per iteration
    169   %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] ; iteration counter, starts at 0
    170   %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ] ; current struct, starts at %p
    171   %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
    172   store i8 1, i8* %2, align 1 ; fields 0..3 are constants; the assertions above expect one 4-byte store for them
    173   %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
    174   store i8 2, i8* %3, align 1
    175   %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
    176   store i8 3, i8* %4, align 1
    177   %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
    178   store i8 4, i8* %5, align 1
    179   %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
    180   store i8 %zz, i8* %6, align 1                     ;  <----------- Not a const;
    181   %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
    182   store i8 6, i8* %7, align 1 ; fields 5 and 6 are expected as one 2-byte store, field 7 as a single byte
    183   %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
    184   store i8 7, i8* %8, align 1
    185   %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
    186   store i8 8, i8* %9, align 1
    187   %10 = add nsw i32 %i.02, 1
    188   %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1 ; step to the next %struct.A (eight i8 fields)
    189   %exitcond = icmp eq i32 %10, %count ; leave once the counter reaches %count
    190   br i1 %exitcond, label %._crit_edge, label %.lr.ph
    191 ._crit_edge:
    192   ret void
    193 }
    194 
    195 define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp { ; copies two adjacent bytes from %q to %p per iteration; the assertions expect a single 16-bit load/store pair
    196 ; BWON-LABEL: merge_loads_i16:
    197 ; BWON:       # %bb.0:
    198 ; BWON-NEXT:    testl %edi, %edi
    199 ; BWON-NEXT:    jle .LBB4_2
    200 ; BWON-NEXT:    .p2align 4, 0x90
    201 ; BWON-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
    202 ; BWON-NEXT:    movzwl (%rsi), %eax
    203 ; BWON-NEXT:    movw %ax, (%rdx)
    204 ; BWON-NEXT:    addq $8, %rdx
    205 ; BWON-NEXT:    decl %edi
    206 ; BWON-NEXT:    jne .LBB4_1
    207 ; BWON-NEXT:  .LBB4_2: # %._crit_edge
    208 ; BWON-NEXT:    retq
    209 ;
    210 ; BWOFF-LABEL: merge_loads_i16:
    211 ; BWOFF:       # %bb.0:
    212 ; BWOFF-NEXT:    testl %edi, %edi
    213 ; BWOFF-NEXT:    jle .LBB4_2
    214 ; BWOFF-NEXT:    .p2align 4, 0x90
    215 ; BWOFF-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
    216 ; BWOFF-NEXT:    movw (%rsi), %ax
    217 ; BWOFF-NEXT:    movw %ax, (%rdx)
    218 ; BWOFF-NEXT:    addq $8, %rdx
    219 ; BWOFF-NEXT:    decl %edi
    220 ; BWOFF-NEXT:    jne .LBB4_1
    221 ; BWOFF-NEXT:  .LBB4_2: # %._crit_edge
    222 ; BWOFF-NEXT:    retq
    223   %1 = icmp sgt i32 %count, 0 ; the loop runs only when %count > 0
    224   br i1 %1, label %.lr.ph, label %._crit_edge
    225 
    226 .lr.ph:                                           ; preds = %0
    227   %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0 ; source addresses are loop-invariant: fields 0 and 1 of %q
    228   %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
    229   br label %4
    230 
    231 ; <label>:4                                       ; preds = %4, %.lr.ph
    232   %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ] ; iteration counter, starts at 0
    233   %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ] ; current destination struct, starts at %p
    234   %5 = load i8, i8* %2, align 1 ; both loads come first ...
    235   %6 = load i8, i8* %3, align 1
    236   %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
    237   store i8 %5, i8* %7, align 1 ; ... followed by both stores, to fields 0 and 1 of the destination
    238   %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
    239   store i8 %6, i8* %8, align 1
    240   %9 = add nsw i32 %i.02, 1
    241   %10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1 ; only the destination pointer advances
    242   %exitcond = icmp eq i32 %9, %count ; leave once the counter reaches %count
    243   br i1 %exitcond, label %._crit_edge, label %4
    244 
    245 ._crit_edge:                                      ; preds = %4, %0
    246   ret void
    247 }
    248 
    249 ; The loads and the stores are interleaved. Can't merge them.
    250 define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
    251 ; BWON-LABEL: no_merge_loads:
    252 ; BWON:       # %bb.0:
    253 ; BWON-NEXT:    testl %edi, %edi
    254 ; BWON-NEXT:    jle .LBB5_2
    255 ; BWON-NEXT:    .p2align 4, 0x90
    256 ; BWON-NEXT:  .LBB5_1: # %a4
    257 ; BWON-NEXT:    # =>This Inner Loop Header: Depth=1
    258 ; BWON-NEXT:    movzbl (%rsi), %eax
    259 ; BWON-NEXT:    movb %al, (%rdx)
    260 ; BWON-NEXT:    movzbl 1(%rsi), %eax
    261 ; BWON-NEXT:    movb %al, 1(%rdx)
    262 ; BWON-NEXT:    addq $8, %rdx
    263 ; BWON-NEXT:    decl %edi
    264 ; BWON-NEXT:    jne .LBB5_1
    265 ; BWON-NEXT:  .LBB5_2: # %._crit_edge
    266 ; BWON-NEXT:    retq
    267 ;
    268 ; BWOFF-LABEL: no_merge_loads:
    269 ; BWOFF:       # %bb.0:
    270 ; BWOFF-NEXT:    testl %edi, %edi
    271 ; BWOFF-NEXT:    jle .LBB5_2
    272 ; BWOFF-NEXT:    .p2align 4, 0x90
    273 ; BWOFF-NEXT:  .LBB5_1: # %a4
    274 ; BWOFF-NEXT:    # =>This Inner Loop Header: Depth=1
    275 ; BWOFF-NEXT:    movb (%rsi), %al
    276 ; BWOFF-NEXT:    movb %al, (%rdx)
    277 ; BWOFF-NEXT:    movb 1(%rsi), %al
    278 ; BWOFF-NEXT:    movb %al, 1(%rdx)
    279 ; BWOFF-NEXT:    addq $8, %rdx
    280 ; BWOFF-NEXT:    decl %edi
    281 ; BWOFF-NEXT:    jne .LBB5_1
    282 ; BWOFF-NEXT:  .LBB5_2: # %._crit_edge
    283 ; BWOFF-NEXT:    retq
    284   %1 = icmp sgt i32 %count, 0 ; the loop runs only when %count > 0
    285   br i1 %1, label %.lr.ph, label %._crit_edge
    286 
    287 .lr.ph:                                           ; preds = %0
    288   %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0 ; source addresses are loop-invariant: fields 0 and 1 of %q
    289   %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
    290   br label %a4
    291 
    292 a4:                                       ; preds = %a4, %.lr.ph
    293   %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ] ; iteration counter, starts at 0
    294   %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ] ; current destination struct, starts at %p
    295   %a5 = load i8, i8* %2, align 1 ; first byte: load ...
    296   %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
    297   store i8 %a5, i8* %a7, align 1 ; ... then store, before the second byte is loaded
    298   %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
    299   %a6 = load i8, i8* %3, align 1 ; second load sits after the first store; the assertions expect separate byte copies
    300   store i8 %a6, i8* %a8, align 1
    301   %a9 = add nsw i32 %i.02, 1
    302   %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1 ; only the destination pointer advances
    303   %exitcond = icmp eq i32 %a9, %count ; leave once the counter reaches %count
    304   br i1 %exitcond, label %._crit_edge, label %a4
    305 
    306 ._crit_edge:                                      ; preds = %a4, %0
    307   ret void
    308 }
    309 
    310 define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { ; copies two adjacent i32 fields from %q to %p per iteration; the assertions expect a single 64-bit load/store pair
    311 ; CHECK-LABEL: merge_loads_integer:
    312 ; CHECK:       # %bb.0:
    313 ; CHECK-NEXT:    testl %edi, %edi
    314 ; CHECK-NEXT:    jle .LBB6_2
    315 ; CHECK-NEXT:    .p2align 4, 0x90
    316 ; CHECK-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
    317 ; CHECK-NEXT:    movq (%rsi), %rax
    318 ; CHECK-NEXT:    movq %rax, (%rdx)
    319 ; CHECK-NEXT:    addq $32, %rdx
    320 ; CHECK-NEXT:    decl %edi
    321 ; CHECK-NEXT:    jne .LBB6_1
    322 ; CHECK-NEXT:  .LBB6_2: # %._crit_edge
    323 ; CHECK-NEXT:    retq
    324   %1 = icmp sgt i32 %count, 0 ; the loop runs only when %count > 0
    325   br i1 %1, label %.lr.ph, label %._crit_edge
    326 
    327 .lr.ph:                                           ; preds = %0
    328   %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0 ; source addresses are loop-invariant: fields 0 and 1 of %q
    329   %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
    330   br label %4
    331 
    332 ; <label>:4                                       ; preds = %4, %.lr.ph
    333   %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ] ; iteration counter, starts at 0
    334   %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ] ; current destination struct, starts at %p
    335   %5 = load i32, i32* %2 ; no explicit alignment on these loads and stores
    336   %6 = load i32, i32* %3
    337   %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
    338   store i32 %5, i32* %7
    339   %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
    340   store i32 %6, i32* %8
    341   %9 = add nsw i32 %i.02, 1
    342   %10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; only the destination pointer advances
    343   %exitcond = icmp eq i32 %9, %count ; leave once the counter reaches %count
    344   br i1 %exitcond, label %._crit_edge, label %4
    345 
    346 ._crit_edge:                                      ; preds = %4, %0
    347   ret void
    348 }
    349 
    350 define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { ; copies four adjacent i32 fields from %q to %p per iteration; the assertions expect a single 16-byte xmm load/store pair
    351 ; CHECK-LABEL: merge_loads_vector:
    352 ; CHECK:       # %bb.0:
    353 ; CHECK-NEXT:    testl %edi, %edi
    354 ; CHECK-NEXT:    jle .LBB7_2
    355 ; CHECK-NEXT:    .p2align 4, 0x90
    356 ; CHECK-NEXT:  .LBB7_1: # %block4
    357 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
    358 ; CHECK-NEXT:    vmovups (%rsi), %xmm0
    359 ; CHECK-NEXT:    vmovups %xmm0, (%rdx)
    360 ; CHECK-NEXT:    addq $32, %rdx
    361 ; CHECK-NEXT:    decl %edi
    362 ; CHECK-NEXT:    jne .LBB7_1
    363 ; CHECK-NEXT:  .LBB7_2: # %._crit_edge
    364 ; CHECK-NEXT:    retq
    365   %a1 = icmp sgt i32 %count, 0 ; the loop runs only when %count > 0
    366   br i1 %a1, label %.lr.ph, label %._crit_edge
    367 
    368 .lr.ph:                                           ; preds = %0
    369   %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0 ; source addresses are loop-invariant: fields 0..3 of %q
    370   %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
    371   %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
    372   %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
    373   br label %block4
    374 
    375 block4:                                       ; preds = %block4, %.lr.ph
    376   %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ] ; iteration counter, starts at 0
    377   %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ] ; current destination struct, starts at %p
    378   %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0 ; destination addresses: fields 0..3 of the current struct
    379   %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
    380   %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
    381   %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
    382   %b1 = load i32, i32* %a2 ; all four loads come first ...
    383   %b2 = load i32, i32* %a3
    384   %b3 = load i32, i32* %a4
    385   %b4 = load i32, i32* %a5
    386   store i32 %b1, i32* %a7 ; ... followed by all four stores
    387   store i32 %b2, i32* %a8
    388   store i32 %b3, i32* %a9
    389   store i32 %b4, i32* %a10
    390   %c9 = add nsw i32 %i.02, 1
    391   %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; only the destination pointer advances
    392   %exitcond = icmp eq i32 %c9, %count ; leave once the counter reaches %count
    393   br i1 %exitcond, label %._crit_edge, label %block4
    394 
    395 ._crit_edge:                                      ; preds = %block4, %0
    396   ret void
    397 }
    398 
    399 ; On x86, even unaligned copies can be merged to vector ops.
    400 define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
    401 ; CHECK-LABEL: merge_loads_no_align:
    402 ; CHECK:       # %bb.0:
    403 ; CHECK-NEXT:    testl %edi, %edi
    404 ; CHECK-NEXT:    jle .LBB8_2
    405 ; CHECK-NEXT:    .p2align 4, 0x90
    406 ; CHECK-NEXT:  .LBB8_1: # %block4
    407 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
    408 ; CHECK-NEXT:    vmovups (%rsi), %xmm0
    409 ; CHECK-NEXT:    vmovups %xmm0, (%rdx)
    410 ; CHECK-NEXT:    addq $32, %rdx
    411 ; CHECK-NEXT:    decl %edi
    412 ; CHECK-NEXT:    jne .LBB8_1
    413 ; CHECK-NEXT:  .LBB8_2: # %._crit_edge
    414 ; CHECK-NEXT:    retq
    415   %a1 = icmp sgt i32 %count, 0 ; the loop runs only when %count > 0
    416   br i1 %a1, label %.lr.ph, label %._crit_edge
    417 
    418 .lr.ph:                                           ; preds = %0
    419   %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0 ; source addresses are loop-invariant: fields 0..3 of %q
    420   %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
    421   %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
    422   %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
    423   br label %block4
    424 
    425 block4:                                       ; preds = %block4, %.lr.ph
    426   %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ] ; iteration counter, starts at 0
    427   %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ] ; current destination struct, starts at %p
    428   %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0 ; destination addresses: fields 0..3 of the current struct
    429   %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
    430   %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
    431   %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
    432   %b1 = load i32, i32* %a2, align 1 ; every access is marked align 1; the assertions still expect one unaligned xmm copy
    433   %b2 = load i32, i32* %a3, align 1
    434   %b3 = load i32, i32* %a4, align 1
    435   %b4 = load i32, i32* %a5, align 1
    436   store i32 %b1, i32* %a7, align 1
    437   store i32 %b2, i32* %a8, align 1
    438   store i32 %b3, i32* %a9, align 1
    439   store i32 %b4, i32* %a10, align 1
    440   %c9 = add nsw i32 %i.02, 1
    441   %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; only the destination pointer advances
    442   %exitcond = icmp eq i32 %c9, %count ; leave once the counter reaches %count
    443   br i1 %exitcond, label %._crit_edge, label %block4
    444 
    445 ._crit_edge:                                      ; preds = %block4, %0
    446   ret void
    447 }
    448 
    449 ; Make sure that we merge the consecutive load/store sequence below and use a
    450 ; word (16 bit) instead of a byte copy.
    451 define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
    452 ; BWON-LABEL: MergeLoadStoreBaseIndexOffset:
    453 ; BWON:       # %bb.0:
    454 ; BWON-NEXT:    movl %ecx, %r8d
    455 ; BWON-NEXT:    xorl %ecx, %ecx
    456 ; BWON-NEXT:    .p2align 4, 0x90
    457 ; BWON-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
    458 ; BWON-NEXT:    movq (%rdi,%rcx,8), %rax
    459 ; BWON-NEXT:    movzwl (%rdx,%rax), %eax
    460 ; BWON-NEXT:    movw %ax, (%rsi,%rcx,2)
    461 ; BWON-NEXT:    incq %rcx
    462 ; BWON-NEXT:    cmpl %ecx, %r8d
    463 ; BWON-NEXT:    jne .LBB9_1
    464 ; BWON-NEXT:  # %bb.2:
    465 ; BWON-NEXT:    retq
    466 ;
    467 ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
    468 ; BWOFF:       # %bb.0:
    469 ; BWOFF-NEXT:    movl %ecx, %r8d
    470 ; BWOFF-NEXT:    xorl %ecx, %ecx
    471 ; BWOFF-NEXT:    .p2align 4, 0x90
    472 ; BWOFF-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
    473 ; BWOFF-NEXT:    movq (%rdi,%rcx,8), %rax
    474 ; BWOFF-NEXT:    movw (%rdx,%rax), %ax
    475 ; BWOFF-NEXT:    movw %ax, (%rsi,%rcx,2)
    476 ; BWOFF-NEXT:    incq %rcx
    477 ; BWOFF-NEXT:    cmpl %ecx, %r8d
    478 ; BWOFF-NEXT:    jne .LBB9_1
    479 ; BWOFF-NEXT:  # %bb.2:
    480 ; BWOFF-NEXT:    retq
    481   br label %1
    482 
    483 ; <label>:1
    484   %.09 = phi i32 [ %n, %0 ], [ %11, %1 ] ; remaining iterations, counts down from %n
    485   %.08 = phi i8* [ %b, %0 ], [ %10, %1 ] ; destination cursor in %b, advances 2 bytes per iteration
    486   %.0 = phi i64* [ %a, %0 ], [ %2, %1 ] ; cursor over the i64 index array %a
    487   %2 = getelementptr inbounds i64, i64* %.0, i64 1
    488   %3 = load i64, i64* %.0, align 1 ; index into %c for this iteration
    489   %4 = getelementptr inbounds i8, i8* %c, i64 %3
    490   %5 = load i8, i8* %4, align 1 ; byte at %c[index]
    491   %6 = add i64 %3, 1
    492   %7 = getelementptr inbounds i8, i8* %c, i64 %6
    493   %8 = load i8, i8* %7, align 1 ; byte at %c[index + 1], adjacent to the previous load
    494   store i8 %5, i8* %.08, align 1 ; the two stores hit adjacent destination bytes
    495   %9 = getelementptr inbounds i8, i8* %.08, i64 1
    496   store i8 %8, i8* %9, align 1
    497   %10 = getelementptr inbounds i8, i8* %.08, i64 2
    498   %11 = add nsw i32 %.09, -1
    499   %12 = icmp eq i32 %11, 0 ; stop when the countdown reaches zero
    500   br i1 %12, label %13, label %1
    501 
    502 ; <label>:13
    503   ret void
    504 }
    505 
    506 ; Make sure that we merge the consecutive load/store sequence below and use a
    507 ; word (16 bit) instead of a byte copy for complicated address calculation.
    508 define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
    509 ; BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
    510 ; BWON:       # %bb.0:
    511 ; BWON-NEXT:    xorl %r8d, %r8d
    512 ; BWON-NEXT:    .p2align 4, 0x90
    513 ; BWON-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
    514 ; BWON-NEXT:    movsbq (%rsi), %rax
    515 ; BWON-NEXT:    movzwl (%rdx,%rax), %eax
    516 ; BWON-NEXT:    movw %ax, (%rdi,%r8)
    517 ; BWON-NEXT:    incq %rsi
    518 ; BWON-NEXT:    addq $2, %r8
    519 ; BWON-NEXT:    cmpq %rcx, %r8
    520 ; BWON-NEXT:    jl .LBB10_1
    521 ; BWON-NEXT:  # %bb.2:
    522 ; BWON-NEXT:    retq
    523 ;
    524 ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
    525 ; BWOFF:       # %bb.0:
    526 ; BWOFF-NEXT:    xorl %r8d, %r8d
    527 ; BWOFF-NEXT:    .p2align 4, 0x90
    528 ; BWOFF-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
    529 ; BWOFF-NEXT:    movsbq (%rsi), %rax
    530 ; BWOFF-NEXT:    movw (%rdx,%rax), %ax
    531 ; BWOFF-NEXT:    movw %ax, (%rdi,%r8)
    532 ; BWOFF-NEXT:    incq %rsi
    533 ; BWOFF-NEXT:    addq $2, %r8
    534 ; BWOFF-NEXT:    cmpq %rcx, %r8
    535 ; BWOFF-NEXT:    jl .LBB10_1
    536 ; BWOFF-NEXT:  # %bb.2:
    537 ; BWOFF-NEXT:    retq
    538   br label %1
    539 
    540 ; <label>:1
    541   %.09 = phi i64 [ 0, %0 ], [ %13, %1 ] ; byte offset into %a; starts at 0 and steps by 2, so it is always even
    542   %.08 = phi i8* [ %b, %0 ], [ %12, %1 ] ; cursor over the i8 index array %b
    543   %2 = load i8, i8* %.08, align 1
    544   %3 = sext i8 %2 to i64 ; sign-extended index into %c
    545   %4 = getelementptr inbounds i8, i8* %c, i64 %3
    546   %5 = load i8, i8* %4, align 1 ; byte at %c[index]
    547   %6 = add nsw i64 %3, 1
    548   %7 = getelementptr inbounds i8, i8* %c, i64 %6
    549   %8 = load i8, i8* %7, align 1 ; byte at %c[index + 1], adjacent to the previous load
    550   %9 = getelementptr inbounds i8, i8* %a, i64 %.09
    551   store i8 %5, i8* %9, align 1
    552   %10 = or i64 %.09, 1 ; equals %.09 + 1 because %.09 is even; the second store address is written as an 'or'
    553   %11 = getelementptr inbounds i8, i8* %a, i64 %10
    554   store i8 %8, i8* %11, align 1
    555   %12 = getelementptr inbounds i8, i8* %.08, i64 1
    556   %13 = add nuw nsw i64 %.09, 2
    557   %14 = icmp slt i64 %13, %n ; keep looping while the next offset is below %n (signed compare)
    558   br i1 %14, label %1, label %15
    559 
    560 ; <label>:15
    561   ret void
    562 }
    563 
    564 ; Make sure that we merge the consecutive load/store sequence below and use a
    565 ; word (16 bit) instead of a byte copy even if there are intermediate sign
    566 ; extensions.
    567 define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
    568 ; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
    569 ; BWON:       # %bb.0:
    570 ; BWON-NEXT:    movl %ecx, %r8d
    571 ; BWON-NEXT:    xorl %ecx, %ecx
    572 ; BWON-NEXT:    .p2align 4, 0x90
    573 ; BWON-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
    574 ; BWON-NEXT:    movsbq (%rdi,%rcx), %rax
    575 ; BWON-NEXT:    movzwl (%rdx,%rax), %eax
    576 ; BWON-NEXT:    movw %ax, (%rsi,%rcx,2)
    577 ; BWON-NEXT:    incq %rcx
    578 ; BWON-NEXT:    cmpl %ecx, %r8d
    579 ; BWON-NEXT:    jne .LBB11_1
    580 ; BWON-NEXT:  # %bb.2:
    581 ; BWON-NEXT:    retq
    582 ;
    583 ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
    584 ; BWOFF:       # %bb.0:
    585 ; BWOFF-NEXT:    movl %ecx, %r8d
    586 ; BWOFF-NEXT:    xorl %ecx, %ecx
    587 ; BWOFF-NEXT:    .p2align 4, 0x90
    588 ; BWOFF-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
    589 ; BWOFF-NEXT:    movsbq (%rdi,%rcx), %rax
    590 ; BWOFF-NEXT:    movw (%rdx,%rax), %ax
    591 ; BWOFF-NEXT:    movw %ax, (%rsi,%rcx,2)
    592 ; BWOFF-NEXT:    incq %rcx
    593 ; BWOFF-NEXT:    cmpl %ecx, %r8d
    594 ; BWOFF-NEXT:    jne .LBB11_1
    595 ; BWOFF-NEXT:  # %bb.2:
    596 ; BWOFF-NEXT:    retq
    597   br label %1
    598 
    599 ; <label>:1
    600   %.09 = phi i32 [ %n, %0 ], [ %12, %1 ] ; remaining iterations, counts down from %n
    601   %.08 = phi i8* [ %b, %0 ], [ %11, %1 ] ; destination cursor in %b, advances 2 bytes per iteration
    602   %.0 = phi i8* [ %a, %0 ], [ %2, %1 ] ; cursor over the i8 index array %a
    603   %2 = getelementptr inbounds i8, i8* %.0, i64 1
    604   %3 = load i8, i8* %.0, align 1
    605   %4 = sext i8 %3 to i64 ; the index is sign-extended first ...
    606   %5 = getelementptr inbounds i8, i8* %c, i64 %4
    607   %6 = load i8, i8* %5, align 1 ; byte at %c[index]
    608   %7 = add i64 %4, 1 ; ... and the +1 is done in i64, so both addresses share the same extended base
    609   %8 = getelementptr inbounds i8, i8* %c, i64 %7
    610   %9 = load i8, i8* %8, align 1 ; byte at %c[index + 1]
    611   store i8 %6, i8* %.08, align 1 ; the two stores hit adjacent destination bytes
    612   %10 = getelementptr inbounds i8, i8* %.08, i64 1
    613   store i8 %9, i8* %10, align 1
    614   %11 = getelementptr inbounds i8, i8* %.08, i64 2
    615   %12 = add nsw i32 %.09, -1
    616   %13 = icmp eq i32 %12, 0 ; stop when the countdown reaches zero
    617   br i1 %13, label %14, label %1
    618 
    619 ; <label>:14
    620   ret void
    621 }
    622 
    623 ; However, we can only ignore sign extensions when merging if they are applied
    624 ; to all of the memory address computations.
    625 define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
    626 ; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
    627 ; BWON:       # %bb.0:
    628 ; BWON-NEXT:    movl %ecx, %r8d
    629 ; BWON-NEXT:    xorl %ecx, %ecx
    630 ; BWON-NEXT:    .p2align 4, 0x90
    631 ; BWON-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
    632 ; BWON-NEXT:    movsbq (%rdi,%rcx), %rax
    633 ; BWON-NEXT:    movzbl (%rdx,%rax), %r9d
    634 ; BWON-NEXT:    incb %al
    635 ; BWON-NEXT:    movsbq %al, %rax
    636 ; BWON-NEXT:    movzbl (%rdx,%rax), %eax
    637 ; BWON-NEXT:    movb %r9b, (%rsi,%rcx,2)
    638 ; BWON-NEXT:    movb %al, 1(%rsi,%rcx,2)
    639 ; BWON-NEXT:    incq %rcx
    640 ; BWON-NEXT:    cmpl %ecx, %r8d
    641 ; BWON-NEXT:    jne .LBB12_1
    642 ; BWON-NEXT:  # %bb.2:
    643 ; BWON-NEXT:    retq
    644 ;
    645 ; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
    646 ; BWOFF:       # %bb.0:
    647 ; BWOFF-NEXT:    movl %ecx, %r8d
    648 ; BWOFF-NEXT:    xorl %ecx, %ecx
    649 ; BWOFF-NEXT:    .p2align 4, 0x90
    650 ; BWOFF-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
    651 ; BWOFF-NEXT:    movsbq (%rdi,%rcx), %rax
    652 ; BWOFF-NEXT:    movb (%rdx,%rax), %r9b
    653 ; BWOFF-NEXT:    incb %al
    654 ; BWOFF-NEXT:    movsbq %al, %rax
    655 ; BWOFF-NEXT:    movb (%rdx,%rax), %al
    656 ; BWOFF-NEXT:    movb %r9b, (%rsi,%rcx,2)
    657 ; BWOFF-NEXT:    movb %al, 1(%rsi,%rcx,2)
    658 ; BWOFF-NEXT:    incq %rcx
    659 ; BWOFF-NEXT:    cmpl %ecx, %r8d
    660 ; BWOFF-NEXT:    jne .LBB12_1
    661 ; BWOFF-NEXT:  # %bb.2:
    662 ; BWOFF-NEXT:    retq
    663   br label %1
    664 
    665 ; <label>:1
    666   %.09 = phi i32 [ %n, %0 ], [ %12, %1 ] ; remaining iterations, counts down from %n
    667   %.08 = phi i8* [ %b, %0 ], [ %11, %1 ] ; destination cursor in %b, advances 2 bytes per iteration
    668   %.0 = phi i8* [ %a, %0 ], [ %2, %1 ] ; cursor over the i8 index array %a
    669   %2 = getelementptr inbounds i8, i8* %.0, i64 1
    670   %3 = load i8, i8* %.0, align 1
    671   %4 = sext i8 %3 to i64 ; first index: sign-extended directly
    672   %5 = getelementptr inbounds i8, i8* %c, i64 %4
    673   %6 = load i8, i8* %5, align 1
    674   %7 = add i8 %3, 1 ; second index: the +1 happens in i8 (no nsw, so it may wrap) before the extension
    675   %wrap.4 = sext i8 %7 to i64 ; hence not guaranteed to equal %4 + 1; the assertions expect two separate byte loads
    676   %8 = getelementptr inbounds i8, i8* %c, i64 %wrap.4
    677   %9 = load i8, i8* %8, align 1
    678   store i8 %6, i8* %.08, align 1 ; the stores are adjacent, yet the assertions expect separate byte stores as well
    679   %10 = getelementptr inbounds i8, i8* %.08, i64 1
    680   store i8 %9, i8* %10, align 1
    681   %11 = getelementptr inbounds i8, i8* %.08, i64 2
    682   %12 = add nsw i32 %.09, -1
    683   %13 = icmp eq i32 %12, 0 ; stop when the countdown reaches zero
    684   br i1 %13, label %14, label %1
    685 
    686 ; <label>:14
    687   ret void
    688 }
    689 
    690 ; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
    691 define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
    692 ; CHECK-LABEL: merge_vec_element_store:
    693 ; CHECK:       # %bb.0:
    694 ; CHECK-NEXT:    vmovups %ymm0, (%rdi)
    695 ; CHECK-NEXT:    vzeroupper
    696 ; CHECK-NEXT:    retq
    697   %vecext0 = extractelement <8 x float> %v, i32 0 ; pull out all eight lanes of %v individually
    698   %vecext1 = extractelement <8 x float> %v, i32 1
    699   %vecext2 = extractelement <8 x float> %v, i32 2
    700   %vecext3 = extractelement <8 x float> %v, i32 3
    701   %vecext4 = extractelement <8 x float> %v, i32 4
    702   %vecext5 = extractelement <8 x float> %v, i32 5
    703   %vecext6 = extractelement <8 x float> %v, i32 6
    704   %vecext7 = extractelement <8 x float> %v, i32 7
    705   %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1 ; addresses of %ptr[1]..%ptr[7]; lane 0 goes to %ptr itself
    706   %arrayidx2 = getelementptr inbounds float, float* %ptr, i64 2
    707   %arrayidx3 = getelementptr inbounds float, float* %ptr, i64 3
    708   %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 4
    709   %arrayidx5 = getelementptr inbounds float, float* %ptr, i64 5
    710   %arrayidx6 = getelementptr inbounds float, float* %ptr, i64 6
    711   %arrayidx7 = getelementptr inbounds float, float* %ptr, i64 7
    712   store float %vecext0, float* %ptr, align 4 ; lane i is stored to %ptr[i]; the assertions above expect one 32-byte ymm store
    713   store float %vecext1, float* %arrayidx1, align 4
    714   store float %vecext2, float* %arrayidx2, align 4
    715   store float %vecext3, float* %arrayidx3, align 4
    716   store float %vecext4, float* %arrayidx4, align 4
    717   store float %vecext5, float* %arrayidx5, align 4
    718   store float %vecext6, float* %arrayidx6, align 4
    719   store float %vecext7, float* %arrayidx7, align 4
    720   ret void
    721 
    722 }
    723 
    724 ; PR21711 - Merge vector stores into wider vector stores.
    725 ; These should be merged into 32-byte stores.
        ; The low and high <4 x float> halves of %v1 and %v2 are stored to four
        ; consecutive 16-byte slots, %ptr[3] .. %ptr[6] (byte offsets 48, 64, 80
        ; and 96). The assertions expect each pair of halves to be written back
        ; as one whole 32-byte register: %ymm0 at offset 48, %ymm1 at offset 80.
    726 define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
    727 ; CHECK-LABEL: merge_vec_extract_stores:
    728 ; CHECK:       # %bb.0:
    729 ; CHECK-NEXT:    vmovups %ymm0, 48(%rdi)
    730 ; CHECK-NEXT:    vmovups %ymm1, 80(%rdi)
    731 ; CHECK-NEXT:    vzeroupper
    732 ; CHECK-NEXT:    retq
        ; Destination slots, indexed in units of <4 x float> (16 bytes each).
    733   %idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
    734   %idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
    735   %idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
    736   %idx3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
        ; Masks 0-3 select the low half of the source, masks 4-7 the high half.
    737   %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    738   %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    739   %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    740   %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        ; Halves are stored in order: v1.lo, v1.hi, v2.lo, v2.hi.
    741   store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
    742   store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
    743   store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
    744   store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
    745   ret void
    746 
    747 }
    748 
    749 ; Merging vector stores when sourced from vector loads.
        ; Two adjacent <4 x float> values are loaded from %v[0] and %v[1] and
        ; stored to %ptr[0] and %ptr[1]. The assertions expect the pair to become
        ; one 32-byte load from (%rdi) followed by one 32-byte store to (%rsi).
    750 define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
    751 ; CHECK-LABEL: merge_vec_stores_from_loads:
    752 ; CHECK:       # %bb.0:
    753 ; CHECK-NEXT:    vmovups (%rdi), %ymm0
    754 ; CHECK-NEXT:    vmovups %ymm0, (%rsi)
    755 ; CHECK-NEXT:    vzeroupper
    756 ; CHECK-NEXT:    retq
        ; Source addresses; the loads carry no explicit alignment.
    757   %load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
    758   %load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
    759   %v0 = load <4 x float>, <4 x float>* %load_idx0
    760   %v1 = load <4 x float>, <4 x float>* %load_idx1
        ; Destination addresses mirror the source layout; the stores are align 16.
    761   %store_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 0
    762   %store_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
    763   store <4 x float> %v0, <4 x float>* %store_idx0, align 16
    764   store <4 x float> %v1, <4 x float>* %store_idx1, align 16
    765   ret void
    766 
    767 }
    768 
    769 ; Merging vector stores when sourced from a constant vector is not currently handled.
        ; Two all-zero <4 x i32> constants are stored to adjacent slots %ptr[3]
        ; and %ptr[4] (byte offsets 48 and 64). The assertions record the
        ; unmerged result: %xmm0 is zeroed once and written with two separate
        ; 16-byte aligned stores.
    770 define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
    771 ; CHECK-LABEL: merge_vec_stores_of_constants:
    772 ; CHECK:       # %bb.0:
    773 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
    774 ; CHECK-NEXT:    vmovaps %xmm0, 48(%rdi)
    775 ; CHECK-NEXT:    vmovaps %xmm0, 64(%rdi)
    776 ; CHECK-NEXT:    retq
        ; Adjacent destination slots, indexed in units of <4 x i32> (16 bytes).
    777   %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
    778   %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
    779   store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
    780   store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
    781   ret void
    782 
    783 }
    784 
    785 ; This is a minimized test based on real code that was failing.
    786 ; This should now be merged.
        ; Copies %array[0] and %array[1] into %array[4] and %array[5]. The first
        ; value comes from a scalar i64 load; the second is element 0 of a
        ; <2 x i64> load that starts at %array[1]. The assertions expect one
        ; 16-byte load from offset 0 and one 16-byte store to offset 32.
    787 define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
    788 ; CHECK-LABEL: merge_vec_element_and_scalar_load:
    789 ; CHECK:       # %bb.0:
    790 ; CHECK-NEXT:    vmovups (%rdi), %xmm0
    791 ; CHECK-NEXT:    vmovups %xmm0, 32(%rdi)
    792 ; CHECK-NEXT:    retq
        ; Element addresses: sources are indices 0 and 1, destinations 4 and 5.
    793   %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
    794   %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
    795   %idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
    796   %idx5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5
    797 
        ; Plain scalar copy: array[4] = array[0].
    798   %a0 = load i64, i64* %idx0, align 8
    799   store i64 %a0, i64* %idx4, align 8
    800 
        ; Same copy for array[5] = array[1], but the value is read through a
        ; two-element vector load at &array[1] and then extracted as lane 0.
    801   %b = bitcast i64* %idx1 to <2 x i64>*
    802   %v = load <2 x i64>, <2 x i64>* %b, align 8
    803   %a1 = extractelement <2 x i64> %v, i32 0
    804   store i64 %a1, i64* %idx5, align 8
    805   ret void
    806 
    807 }
    808 
    809 ; Don't let a non-consecutive store thwart merging of the last two.
        ; Four constant byte stores hit offsets 0, 42, 2 and 3 from %p. Only
        ; offsets 2 and 3 are adjacent; the assertions expect those two to fuse
        ; into a single 16-bit store of 0x0302 at offset 2, while the stores to
        ; offsets 0 and 42 remain individual byte stores.
    810 define void @almost_consecutive_stores(i8* %p) {
    811 ; CHECK-LABEL: almost_consecutive_stores:
    812 ; CHECK:       # %bb.0:
    813 ; CHECK-NEXT:    movb $0, (%rdi)
    814 ; CHECK-NEXT:    movb $1, 42(%rdi)
    815 ; CHECK-NEXT:    movw $770, 2(%rdi) # imm = 0x302
    816 ; CHECK-NEXT:    retq
    817   store i8 0, i8* %p
        ; Far-away store (offset 42) placed between the others in program order.
    818   %p1 = getelementptr i8, i8* %p, i64 42
    819   store i8 1, i8* %p1
        ; The adjacent pair: byte 2 at offset 2, byte 3 at offset 3.
    820   %p2 = getelementptr i8, i8* %p, i64 2
    821   store i8 2, i8* %p2
    822   %p3 = getelementptr i8, i8* %p, i64 3
    823   store i8 3, i8* %p3
    824   ret void
    825 }
    826 
    827 ; We should be able to merge these.
        ; All four lanes of %v are stored as floats to %ptr[0] .. %ptr[3], but
        ; the scalars are produced in two different ways: lane 0 is extracted
        ; from a float bitcast of the whole vector, while lanes 1-3 are
        ; extracted as i32 and bitcast to float individually. The assertions
        ; expect a single unaligned 16-byte store of %xmm0 regardless.
    828 define void @merge_bitcast(<4 x i32> %v, float* %ptr) {
    829 ; CHECK-LABEL: merge_bitcast:
    830 ; CHECK:       # %bb.0:
    831 ; CHECK-NEXT:    vmovups %xmm0, (%rdi)
    832 ; CHECK-NEXT:    retq
        ; Vector-level bitcast, used only for lane 0 below.
    833   %fv = bitcast <4 x i32> %v to <4 x float>
        ; Lanes 1-3 are taken from the original integer vector.
    834   %vecext1 = extractelement <4 x i32> %v, i32 1
    835   %vecext2 = extractelement <4 x i32> %v, i32 2
    836   %vecext3 = extractelement <4 x i32> %v, i32 3
        ; %f0 is extracted directly as float; %f1 .. %f3 are scalar bitcasts.
    837   %f0 = extractelement <4 x float> %fv, i32 0
    838   %f1 = bitcast i32 %vecext1 to float
    839   %f2 = bitcast i32 %vecext2 to float
    840   %f3 = bitcast i32 %vecext3 to float
        ; Consecutive float slots %ptr[0] .. %ptr[3].
    841   %idx0 = getelementptr inbounds float, float* %ptr, i64 0
    842   %idx1 = getelementptr inbounds float, float* %ptr, i64 1
    843   %idx2 = getelementptr inbounds float, float* %ptr, i64 2
    844   %idx3 = getelementptr inbounds float, float* %ptr, i64 3
        ; Lane i is written to element i; each store is only align 4.
    845   store float %f0, float* %idx0, align 4
    846   store float %f1, float* %idx1, align 4
    847   store float %f2, float* %idx2, align 4
    848   store float %f3, float* %idx3, align 4
    849   ret void
    850 }
    851