Home | History | Annotate | Download | only in SROA
      1 ; RUN: opt < %s -sroa -S | FileCheck %s
        ; The command above feeds this file through opt with the SROA pass enabled
        ; and verifies the printed IR against the FileCheck directives embedded in
        ; the comments of each function below.
      2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
      3 
      4 %S1 = type { i64, [42 x float] }
        ; NOTE(review): %S1 is not referenced anywhere in the visible part of this
        ; file; confirm whether it is still needed.
      5 
      6 define i32 @test1(<4 x i32> %x, <4 x i32> %y) {
      7 ; CHECK-LABEL: @test1(
        ; Both vector arguments are stored into a two-element array alloca and three
        ; individual i32 lanes are read back through constant-index GEPs. The
        ; directives expect the alloca, stores and loads to disappear and be replaced
        ; by extractelement operations directly on %x and %y.
      8 entry:
      9 	%a = alloca [2 x <4 x i32>]
     10 ; CHECK-NOT: alloca
     11 
     12   %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
     13   store <4 x i32> %x, <4 x i32>* %a.x
     14   %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
     15   store <4 x i32> %y, <4 x i32>* %a.y
     16 ; CHECK-NOT: store
     17 
        ; Lane 2 of the first vector, then lanes 3 and 0 of the second vector.
     18   %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
     19   %tmp1 = load i32, i32* %a.tmp1
     20   %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
     21   %tmp2 = load i32, i32* %a.tmp2
     22   %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
     23   %tmp3 = load i32, i32* %a.tmp3
     24 ; CHECK-NOT: load
     25 ; CHECK:      extractelement <4 x i32> %x, i32 2
     26 ; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
     27 ; CHECK-NEXT: extractelement <4 x i32> %y, i32 0
     28 
     29   %tmp4 = add i32 %tmp1, %tmp2
     30   %tmp5 = add i32 %tmp3, %tmp4
     31   ret i32 %tmp5
     32 ; CHECK-NEXT: add
     33 ; CHECK-NEXT: add
     34 ; CHECK-NEXT: ret
     35 }
     36 
     37 define i32 @test2(<4 x i32> %x, <4 x i32> %y) {
     38 ; CHECK-LABEL: @test2(
        ; Same setup as test1, except that the third value is read by loading a
        ; <2 x i32> sub-vector from the start of the second vector and extracting
        ; its lane 0. The directives expect that sub-vector load to become a
        ; shufflevector of %y selecting lanes 0 and 1.
     39 entry:
     40 	%a = alloca [2 x <4 x i32>]
     41 ; CHECK-NOT: alloca
     42 
     43   %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
     44   store <4 x i32> %x, <4 x i32>* %a.x
     45   %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
     46   store <4 x i32> %y, <4 x i32>* %a.y
     47 ; CHECK-NOT: store
     48 
     49   %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
     50   %tmp1 = load i32, i32* %a.tmp1
     51   %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
     52   %tmp2 = load i32, i32* %a.tmp2
     53   %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
        ; Reinterpret the pointer to lane 0 of the second vector as a pointer to a
        ; two-lane vector so the load below covers lanes 0 and 1.
     54   %a.tmp3.cast = bitcast i32* %a.tmp3 to <2 x i32>*
     55   %tmp3.vec = load <2 x i32>, <2 x i32>* %a.tmp3.cast
     56   %tmp3 = extractelement <2 x i32> %tmp3.vec, i32 0
     57 ; CHECK-NOT: load
     58 ; CHECK:      %[[extract1:.*]] = extractelement <4 x i32> %x, i32 2
     59 ; CHECK-NEXT: %[[extract2:.*]] = extractelement <4 x i32> %y, i32 3
     60 ; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> %y, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
     61 ; CHECK-NEXT: %[[extract4:.*]] = extractelement <2 x i32> %[[extract3]], i32 0
     62 
     63   %tmp4 = add i32 %tmp1, %tmp2
     64   %tmp5 = add i32 %tmp3, %tmp4
     65   ret i32 %tmp5
     66 ; CHECK-NEXT: %[[sum1:.*]] = add i32 %[[extract1]], %[[extract2]]
     67 ; CHECK-NEXT: %[[sum2:.*]] = add i32 %[[extract4]], %[[sum1]]
     68 ; CHECK-NEXT: ret i32 %[[sum2]]
     69 }
     70 
     71 define i32 @test3(<4 x i32> %x, <4 x i32> %y) {
     72 ; CHECK-LABEL: @test3(
        ; Same setup as test1, but the alloca is additionally written by two memset
        ; calls: one that zeroes the whole second vector (16 bytes) and one that
        ; fills the 4 bytes of lane 2 of the first vector with the byte -1. The
        ; directives expect an insertelement of -1 into %x and extracts from
        ; zeroinitializer instead of from %y.
     73 entry:
     74 	%a = alloca [2 x <4 x i32>]
     75 ; CHECK-NOT: alloca
     76 
     77   %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
     78   store <4 x i32> %x, <4 x i32>* %a.x
     79   %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
     80   store <4 x i32> %y, <4 x i32>* %a.y
     81 ; CHECK-NOT: store
     82 
        ; Overwrite the entire second vector with zero bytes.
     83   %a.y.cast = bitcast <4 x i32>* %a.y to i8*
     84   call void @llvm.memset.p0i8.i32(i8* %a.y.cast, i8 0, i32 16, i32 1, i1 false)
     85 ; CHECK-NOT: memset
     86 
     87   %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
        ; Overwrite only lane 2 of the first vector before it is loaded.
     88   %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
     89   call void @llvm.memset.p0i8.i32(i8* %a.tmp1.cast, i8 -1, i32 4, i32 1, i1 false)
     90   %tmp1 = load i32, i32* %a.tmp1
     91   %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
     92   %tmp2 = load i32, i32* %a.tmp2
     93   %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
     94   %tmp3 = load i32, i32* %a.tmp3
     95 ; CHECK-NOT: load
     96 ; CHECK:      %[[insert:.*]] = insertelement <4 x i32> %x, i32 -1, i32 2
     97 ; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
     98 ; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 3
     99 ; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 0
    100 
    101   %tmp4 = add i32 %tmp1, %tmp2
    102   %tmp5 = add i32 %tmp3, %tmp4
    103   ret i32 %tmp5
    104 ; CHECK-NEXT: add
    105 ; CHECK-NEXT: add
    106 ; CHECK-NEXT: ret
    107 }
    108 
    109 define i32 @test4(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
    110 ; CHECK-LABEL: @test4(
        ; Same setup as test1, but the alloca is additionally written by two memcpy
        ; calls whose source is the external pointer %z: a 16-byte copy into the
        ; second vector and a 4-byte copy from lane 2 of %z into lane 2 of the first
        ; vector. The directives expect a whole-vector load from %z, a GEP plus
        ; scalar load for the single lane, and an insertelement into %x.
    111 entry:
    112 	%a = alloca [2 x <4 x i32>]
    113 ; CHECK-NOT: alloca
    114 
    115   %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
    116   store <4 x i32> %x, <4 x i32>* %a.x
    117   %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
    118   store <4 x i32> %y, <4 x i32>* %a.y
    119 ; CHECK-NOT: store
    120 
        ; Replace the whole second vector with the contents of %z.
    121   %a.y.cast = bitcast <4 x i32>* %a.y to i8*
    122   %z.cast = bitcast <4 x i32>* %z to i8*
    123   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.y.cast, i8* %z.cast, i32 16, i32 1, i1 false)
    124 ; CHECK-NOT: memcpy
    125 
        ; Replace only lane 2 of the first vector with lane 2 of %z.
    126   %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
    127   %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
    128   %z.tmp1 = getelementptr inbounds <4 x i32>, <4 x i32>* %z, i64 0, i64 2
    129   %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
    130   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.tmp1.cast, i8* %z.tmp1.cast, i32 4, i32 1, i1 false)
    131   %tmp1 = load i32, i32* %a.tmp1
    132   %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
    133   %tmp2 = load i32, i32* %a.tmp2
    134   %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
    135   %tmp3 = load i32, i32* %a.tmp3
    136 ; CHECK-NOT: memcpy
    137 ; CHECK:      %[[load:.*]] = load <4 x i32>, <4 x i32>* %z
    138 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* %z, i64 0, i64 2
    139 ; CHECK-NEXT: %[[element_load:.*]] = load i32, i32* %[[gep]]
    140 ; CHECK-NEXT: %[[insert:.*]] = insertelement <4 x i32> %x, i32 %[[element_load]], i32 2
    141 ; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
    142 ; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 3
    143 ; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 0
    144 
    145   %tmp4 = add i32 %tmp1, %tmp2
    146   %tmp5 = add i32 %tmp3, %tmp4
    147   ret i32 %tmp5
    148 ; CHECK-NEXT: add
    149 ; CHECK-NEXT: add
    150 ; CHECK-NEXT: ret
    151 }
    152 
    153 declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i32, i1) nounwind
    154 
    155 ; Same as test4 with a different sized address space pointer source.
        ; NOTE(review): the datalayout string at the top of this file has no explicit
        ; p1 entry; confirm that addrspace(1) pointers really have a different size
        ; under this layout.
    156 define i32 @test4_as1(<4 x i32> %x, <4 x i32> %y, <4 x i32> addrspace(1)* %z) {
    157 ; CHECK-LABEL: @test4_as1(
    158 entry:
    159 	%a = alloca [2 x <4 x i32>]
    160 ; CHECK-NOT: alloca
    161 
    162   %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
    163   store <4 x i32> %x, <4 x i32>* %a.x
    164   %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
    165   store <4 x i32> %y, <4 x i32>* %a.y
    166 ; CHECK-NOT: store
    167 
        ; Replace the whole second vector with the contents of %z (address space 1).
    168   %a.y.cast = bitcast <4 x i32>* %a.y to i8*
    169   %z.cast = bitcast <4 x i32> addrspace(1)* %z to i8 addrspace(1)*
    170   call void @llvm.memcpy.p0i8.p1i8.i32(i8* %a.y.cast, i8 addrspace(1)* %z.cast, i32 16, i32 1, i1 false)
    171 ; CHECK-NOT: memcpy
    172 
    173   %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
    174   %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
        ; The input GEP on %z uses i16 indices, while the expected output below
        ; spells the corresponding GEP with i64 indices.
    175   %z.tmp1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %z, i16 0, i16 2
    176   %z.tmp1.cast = bitcast i32 addrspace(1)* %z.tmp1 to i8 addrspace(1)*
    177   call void @llvm.memcpy.p0i8.p1i8.i32(i8* %a.tmp1.cast, i8 addrspace(1)* %z.tmp1.cast, i32 4, i32 1, i1 false)
    178   %tmp1 = load i32, i32* %a.tmp1
    179   %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
    180   %tmp2 = load i32, i32* %a.tmp2
    181   %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
    182   %tmp3 = load i32, i32* %a.tmp3
    183 ; CHECK-NOT: memcpy
    184 ; CHECK:      %[[load:.*]] = load <4 x i32>, <4 x i32> addrspace(1)* %z
    185 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %z, i64 0, i64 2
    186 ; CHECK-NEXT: %[[element_load:.*]] = load i32, i32 addrspace(1)* %[[gep]]
    187 ; CHECK-NEXT: %[[insert:.*]] = insertelement <4 x i32> %x, i32 %[[element_load]], i32 2
    188 ; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
    189 ; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 3
    190 ; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 0
    191 
    192   %tmp4 = add i32 %tmp1, %tmp2
    193   %tmp5 = add i32 %tmp3, %tmp4
    194   ret i32 %tmp5
    195 ; CHECK-NEXT: add
    196 ; CHECK-NEXT: add
    197 ; CHECK-NEXT: ret
    198 }
    199 
    200 define i32 @test5(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
    201 ; CHECK-LABEL: @test5(
    202 ; The same as the above, but with reversed source and destination for the
    203 ; element memcpy, and a self copy.
        ; The 16-byte memcpy copies the second vector of the alloca over the first
        ; one, and the 4-byte memcpy writes lane 2 of the first vector out to %z.
        ; The directives therefore expect every extracted lane to come from %y and a
        ; scalar store of lane 2 of %y through a GEP on %z.
    204 entry:
    205 	%a = alloca [2 x <4 x i32>]
    206 ; CHECK-NOT: alloca
    207 
    208   %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
    209   store <4 x i32> %x, <4 x i32>* %a.x
    210   %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
    211   store <4 x i32> %y, <4 x i32>* %a.y
    212 ; CHECK-NOT: store
    213 
        ; Copy within the alloca: destination is the first vector, source the second.
    214   %a.y.cast = bitcast <4 x i32>* %a.y to i8*
    215   %a.x.cast = bitcast <4 x i32>* %a.x to i8*
    216   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.x.cast, i8* %a.y.cast, i32 16, i32 1, i1 false)
    217 ; CHECK-NOT: memcpy
    218 
    219   %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
    220   %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
    221   %z.tmp1 = getelementptr inbounds <4 x i32>, <4 x i32>* %z, i64 0, i64 2
    222   %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
        ; Here %z is the destination and the alloca is the source.
    223   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %z.tmp1.cast, i8* %a.tmp1.cast, i32 4, i32 1, i1 false)
    224   %tmp1 = load i32, i32* %a.tmp1
    225   %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
    226   %tmp2 = load i32, i32* %a.tmp2
    227   %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
    228   %tmp3 = load i32, i32* %a.tmp3
    229 ; CHECK-NOT: memcpy
    230 ; CHECK:      %[[gep:.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* %z, i64 0, i64 2
    231 ; CHECK-NEXT: %[[extract:.*]] = extractelement <4 x i32> %y, i32 2
    232 ; CHECK-NEXT: store i32 %[[extract]], i32* %[[gep]]
    233 ; CHECK-NEXT: extractelement <4 x i32> %y, i32 2
    234 ; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
    235 ; CHECK-NEXT: extractelement <4 x i32> %y, i32 0
    236 
    237   %tmp4 = add i32 %tmp1, %tmp2
    238   %tmp5 = add i32 %tmp3, %tmp4
    239   ret i32 %tmp5
    240 ; CHECK-NEXT: add
    241 ; CHECK-NEXT: add
    242 ; CHECK-NEXT: ret
    243 }
    244 
    245 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
    246 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
        ; The two declarations above provide the memcpy and memset intrinsics called
        ; by test3, test4, test5, test_subvec_memset and test_subvec_memcpy.
    247 
    248 define i64 @test6(<4 x i64> %x, <4 x i64> %y, i64 %n) {
    249 ; CHECK-LABEL: @test6(
    250 ; The old scalarrepl pass would wrongly drop the store to the second alloca.
    251 ; PR13254
        ; The load below is addressed with the runtime index %n rather than a
        ; constant, and the directives require both vector stores to still be
        ; present in the output.
    252   %tmp = alloca { <4 x i64>, <4 x i64> }
    253   %p0 = getelementptr inbounds { <4 x i64>, <4 x i64> }, { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0
    254   store <4 x i64> %x, <4 x i64>* %p0
    255 ; CHECK: store <4 x i64> %x,
    256   %p1 = getelementptr inbounds { <4 x i64>, <4 x i64> }, { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 1
    257   store <4 x i64> %y, <4 x i64>* %p1
    258 ; CHECK: store <4 x i64> %y,
    259   %addr = getelementptr inbounds { <4 x i64>, <4 x i64> }, { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0, i64 %n
    260   %res = load i64, i64* %addr, align 4
    261   ret i64 %res
    262 }
    263 
    264 define <4 x i32> @test_subvec_store() {
    265 ; CHECK-LABEL: @test_subvec_store(
        ; Writes a <4 x i32> alloca piecewise: three overlapping <2 x i32> constant
        ; stores starting at lanes 0, 1 and 2, then a scalar i32 store to lane 3.
        ; The directives expect each sub-vector store to become a select with the
        ; matching lane mask and the scalar store to become an insertelement.
    266 entry:
    267   %a = alloca <4 x i32>
    268 ; CHECK-NOT: alloca
    269 
    270   %a.gep0 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 0
    271   %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>*
    272   store <2 x i32> <i32 0, i32 0>, <2 x i32>* %a.cast0
    273 ; CHECK-NOT: store
    274 ; CHECK:     select <4 x i1> <i1 true, i1 true, i1 false, i1 false> 
    275 
    276   %a.gep1 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 1
    277   %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>*
    278   store <2 x i32> <i32 1, i32 1>, <2 x i32>* %a.cast1
    279 ; CHECK-NEXT: select <4 x i1> <i1 false, i1 true, i1 true, i1 false>
    280 
    281   %a.gep2 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 2
    282   %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>*
    283   store <2 x i32> <i32 2, i32 2>, <2 x i32>* %a.cast2
    284 ; CHECK-NEXT: select <4 x i1> <i1 false, i1 false, i1 true, i1 true>
    285 
    286   %a.gep3 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 3
    287   store i32 3, i32* %a.gep3
    288 ; CHECK-NEXT: insertelement <4 x i32>
    289 
    290   %ret = load <4 x i32>, <4 x i32>* %a
    291 
    292   ret <4 x i32> %ret
    293 ; CHECK-NEXT: ret <4 x i32> 
    294 }
    295 
    296 define <4 x i32> @test_subvec_load() {
    297 ; CHECK-LABEL: @test_subvec_load(
        ; Stores a constant <4 x i32> and reads it back as three overlapping
        ; <2 x i32> loads starting at lanes 0, 1 and 2. The directives expect each
        ; load to become a shufflevector of the constant selecting the two lanes it
        ; covers, and the original shuffles to consume those results.
    298 entry:
    299   %a = alloca <4 x i32>
    300 ; CHECK-NOT: alloca
    301   store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a
    302 ; CHECK-NOT: store
    303 
    304   %a.gep0 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 0
    305   %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>*
    306   %first = load <2 x i32>, <2 x i32>* %a.cast0
    307 ; CHECK-NOT: load
    308 ; CHECK:      %[[extract1:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
    309 
    310   %a.gep1 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 1
    311   %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>*
    312   %second = load <2 x i32>, <2 x i32>* %a.cast1
    313 ; CHECK-NEXT: %[[extract2:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 1, i32 2>
    314 
    315   %a.gep2 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 2
    316   %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>*
    317   %third = load <2 x i32>, <2 x i32>* %a.cast2
    318 ; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    319 
        ; Combine the three sub-vectors so that all of the loads above are used.
    320   %tmp = shufflevector <2 x i32> %first, <2 x i32> %second, <2 x i32> <i32 0, i32 2>
    321   %ret = shufflevector <2 x i32> %tmp, <2 x i32> %third, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    322 ; CHECK-NEXT: %[[tmp:.*]] = shufflevector <2 x i32> %[[extract1]], <2 x i32> %[[extract2]], <2 x i32> <i32 0, i32 2>
    323 ; CHECK-NEXT: %[[ret:.*]] = shufflevector <2 x i32> %[[tmp]], <2 x i32> %[[extract3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    324 
    325   ret <4 x i32> %ret
    326 ; CHECK-NEXT: ret <4 x i32> %[[ret]]
    327 }
    328 
    329 declare void @llvm.memset.p0i32.i32(i32* nocapture, i32, i32, i32, i1) nounwind
        ; NOTE(review): no call to this p0i32 variant appears in the visible part of
        ; this file (test_subvec_memset calls @llvm.memset.p0i8.i32); confirm
        ; whether this declaration is still needed.
    330 
    331 define <4 x float> @test_subvec_memset() {
    332 ; CHECK-LABEL: @test_subvec_memset(
        ; Writes a <4 x float> alloca piecewise with memset: three overlapping
        ; 8-byte fills starting at lanes 0, 1 and 2 (fill bytes 0, 1 and 3), then a
        ; 4-byte fill of lane 3 (fill byte 7). The directives expect three selects
        ; with the matching lane masks followed by an insertelement.
    333 entry:
    334   %a = alloca <4 x float>
    335 ; CHECK-NOT: alloca
    336 
    337   %a.gep0 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 0
    338   %a.cast0 = bitcast float* %a.gep0 to i8*
    339   call void @llvm.memset.p0i8.i32(i8* %a.cast0, i8 0, i32 8, i32 0, i1 false)
    340 ; CHECK-NOT: store
    341 ; CHECK: select <4 x i1> <i1 true, i1 true, i1 false, i1 false>
    342 
    343   %a.gep1 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 1
    344   %a.cast1 = bitcast float* %a.gep1 to i8*
    345   call void @llvm.memset.p0i8.i32(i8* %a.cast1, i8 1, i32 8, i32 0, i1 false)
    346 ; CHECK-NEXT: select <4 x i1> <i1 false, i1 true, i1 true, i1 false>
    347 
    348   %a.gep2 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 2
    349   %a.cast2 = bitcast float* %a.gep2 to i8*
    350   call void @llvm.memset.p0i8.i32(i8* %a.cast2, i8 3, i32 8, i32 0, i1 false)
    351 ; CHECK-NEXT: select <4 x i1> <i1 false, i1 false, i1 true, i1 true>
    352 
    353   %a.gep3 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 3
    354   %a.cast3 = bitcast float* %a.gep3 to i8*
    355   call void @llvm.memset.p0i8.i32(i8* %a.cast3, i8 7, i32 4, i32 0, i1 false)
    356 ; CHECK-NEXT: insertelement <4 x float> 
    357 
    358   %ret = load <4 x float>, <4 x float>* %a
    359 
    360   ret <4 x float> %ret
    361 ; CHECK-NEXT: ret <4 x float> 
    362 }
    363 
    364 define <4 x float> @test_subvec_memcpy(i8* %x, i8* %y, i8* %z, i8* %f, i8* %out) {
    365 ; CHECK-LABEL: @test_subvec_memcpy(
        ; Writes a <4 x float> alloca piecewise with memcpy from external pointers:
        ; three overlapping 8-byte copies starting at lanes 0, 1 and 2 (from %x, %y
        ; and %z) and a 4-byte copy into lane 3 (from %f). A final 8-byte memcpy
        ; copies lanes 2 and 3 of the alloca out to %out. The directives expect
        ; typed loads, widening shufflevectors plus selects for the incoming copies,
        ; and a shufflevector plus <2 x float> store for the outgoing copy.
    366 entry:
    367   %a = alloca <4 x float>
    368 ; CHECK-NOT: alloca
    369 
    370   %a.gep0 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 0
    371   %a.cast0 = bitcast float* %a.gep0 to i8*
    372   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast0, i8* %x, i32 8, i32 0, i1 false)
    373 ; CHECK:      %[[xptr:.*]] = bitcast i8* %x to <2 x float>*
    374 ; CHECK-NEXT: %[[x:.*]] = load <2 x float>, <2 x float>* %[[xptr]]
    375 ; CHECK-NEXT: %[[expand_x:.*]] = shufflevector <2 x float> %[[x]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
    376 ; CHECK-NEXT: select <4 x i1> <i1 true, i1 true, i1 false, i1 false>  
    377 
    378   %a.gep1 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 1
    379   %a.cast1 = bitcast float* %a.gep1 to i8*
    380   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast1, i8* %y, i32 8, i32 0, i1 false)
    381 ; CHECK-NEXT: %[[yptr:.*]] = bitcast i8* %y to <2 x float>*
    382 ; CHECK-NEXT: %[[y:.*]] = load <2 x float>, <2 x float>* %[[yptr]]
    383 ; CHECK-NEXT: %[[expand_y:.*]] = shufflevector <2 x float> %[[y]], <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
    384 ; CHECK-NEXT: select <4 x i1> <i1 false, i1 true, i1 true, i1 false>
    385 
    386   %a.gep2 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 2
    387   %a.cast2 = bitcast float* %a.gep2 to i8*
    388   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast2, i8* %z, i32 8, i32 0, i1 false)
    389 ; CHECK-NEXT: %[[zptr:.*]] = bitcast i8* %z to <2 x float>*
    390 ; CHECK-NEXT: %[[z:.*]] = load <2 x float>, <2 x float>* %[[zptr]]
    391 ; CHECK-NEXT: %[[expand_z:.*]] = shufflevector <2 x float> %[[z]], <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
    392 ; CHECK-NEXT: select <4 x i1> <i1 false, i1 false, i1 true, i1 true>
    393 
    394   %a.gep3 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 3
    395   %a.cast3 = bitcast float* %a.gep3 to i8*
    396   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast3, i8* %f, i32 4, i32 0, i1 false)
    397 ; CHECK-NEXT: %[[fptr:.*]] = bitcast i8* %f to float*
    398 ; CHECK-NEXT: %[[f:.*]] = load float, float* %[[fptr]]
    399 ; CHECK-NEXT: %[[insert_f:.*]] = insertelement <4 x float> 
    400 
        ; Outgoing copy: the alloca (starting at lane 2) is the source here.
    401   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %a.cast2, i32 8, i32 0, i1 false)
    402 ; CHECK-NEXT: %[[outptr:.*]] = bitcast i8* %out to <2 x float>*
    403 ; CHECK-NEXT: %[[extract_out:.*]] = shufflevector <4 x float> %[[insert_f]], <4 x float> undef, <2 x i32> <i32 2, i32 3>
    404 ; CHECK-NEXT: store <2 x float> %[[extract_out]], <2 x float>* %[[outptr]]
    405 
    406   %ret = load <4 x float>, <4 x float>* %a
    407 
    408   ret <4 x float> %ret
    409 ; CHECK-NEXT: ret <4 x float> %[[insert_f]]
    410 }
    411 
    412 define i32 @PR14212() {
    413 ; CHECK-LABEL: @PR14212(
    414 ; This caused a crash when "splitting" the load of the i32 in order to promote
    415 ; the store of <3 x i8> properly. Heavily reduced from an OpenCL test case.
        ; The alloca is a <3 x i8> with align 4 that is read back as a full i32.
        ; The directives only require that the alloca disappears and that an i32 is
        ; returned; they do not pin the exact replacement sequence.
    416 entry:
    417   %retval = alloca <3 x i8>, align 4
    418 ; CHECK-NOT: alloca
    419 
    420   store <3 x i8> undef, <3 x i8>* %retval, align 4
    421   %cast = bitcast <3 x i8>* %retval to i32*
    422   %load = load i32, i32* %cast, align 4
    423   ret i32 %load
    424 ; CHECK: ret i32
    425 }
    426 
    427 define <2 x i8> @PR14349.1(i32 %x) {
    428 ; CHECK-LABEL: @PR14349.1(
    429 ; The first testcase for broken SROA rewriting of split integer loads and
    430 ; stores due to smaller vector loads and stores. This particular test ensures
    431 ; that we can rewrite a split store of an integer to a store of a vector.
        ; An i32 is stored to the alloca and only its first two bytes are read back
        ; as a <2 x i8>; the expected output truncates %x to i16 and bitcasts it.
    432 entry:
    433   %a = alloca i32
    434 ; CHECK-NOT: alloca
    435 
    436   store i32 %x, i32* %a
    437 ; CHECK-NOT: store
    438 
    439   %cast = bitcast i32* %a to <2 x i8>*
    440   %vec = load <2 x i8>, <2 x i8>* %cast
    441 ; CHECK-NOT: load
    442 
    443   ret <2 x i8> %vec
    444 ; CHECK: %[[trunc:.*]] = trunc i32 %x to i16
    445 ; CHECK: %[[cast:.*]] = bitcast i16 %[[trunc]] to <2 x i8>
    446 ; CHECK: ret <2 x i8> %[[cast]]
    447 }
    448 
    449 define i32 @PR14349.2(<2 x i8> %x) {
    450 ; CHECK-LABEL: @PR14349.2(
    451 ; The second testcase for broken SROA rewriting of split integer loads and
    452 ; stores due to smaller vector loads and stores. This particular test ensures
    453 ; that we can rewrite a split load of an integer to a load of a vector.
        ; A <2 x i8> is stored into the first two bytes of the alloca and the whole
        ; i32 is read back; the expected output bitcasts %x to i16, zero-extends it
        ; and merges it into the result with an or.
    454 entry:
    455   %a = alloca i32
    456 ; CHECK-NOT: alloca
    457 
    458   %cast = bitcast i32* %a to <2 x i8>*
    459   store <2 x i8> %x, <2 x i8>* %cast
    460 ; CHECK-NOT: store
    461 
    462   %int = load i32, i32* %a
    463 ; CHECK-NOT: load
    464 
    465   ret i32 %int
    466 ; CHECK: %[[cast:.*]] = bitcast <2 x i8> %x to i16
    467 ; CHECK: %[[zext:.*]] = zext i16 %[[cast]] to i32
    468 ; CHECK: %[[insert:.*]] = or i32 %{{.*}}, %[[zext]]
    469 ; CHECK: ret i32 %[[insert]]
    470 }
    471 
    472 define i32 @test7(<2 x i32> %x, <2 x i32> %y) {
    473 ; Test that we can promote to vectors when the alloca doesn't mention any vector types.
    474 ; CHECK-LABEL: @test7(
        ; The alloca is declared as [2 x i64] and is only ever accessed through a
        ; bitcast to [2 x <2 x i32>]*. The directives expect the same shape of
        ; output as test1: extractelement operations on %x and %y.
    475 entry:
    476 	%a = alloca [2 x i64]
    477   %a.cast = bitcast [2 x i64]* %a to [2 x <2 x i32>]*
    478 ; CHECK-NOT: alloca
    479 
    480   %a.x = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 0
    481   store <2 x i32> %x, <2 x i32>* %a.x
    482   %a.y = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 1
    483   store <2 x i32> %y, <2 x i32>* %a.y
    484 ; CHECK-NOT: store
    485 
        ; Lane 1 of the first vector, then lanes 1 and 0 of the second vector.
    486   %a.tmp1 = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 0, i64 1
    487   %tmp1 = load i32, i32* %a.tmp1
    488   %a.tmp2 = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 1, i64 1
    489   %tmp2 = load i32, i32* %a.tmp2
    490   %a.tmp3 = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 1, i64 0
    491   %tmp3 = load i32, i32* %a.tmp3
    492 ; CHECK-NOT: load
    493 ; CHECK:      extractelement <2 x i32> %x, i32 1
    494 ; CHECK-NEXT: extractelement <2 x i32> %y, i32 1
    495 ; CHECK-NEXT: extractelement <2 x i32> %y, i32 0
    496 
    497   %tmp4 = add i32 %tmp1, %tmp2
    498   %tmp5 = add i32 %tmp3, %tmp4
    499   ret i32 %tmp5
    500 ; CHECK-NEXT: add
    501 ; CHECK-NEXT: add
    502 ; CHECK-NEXT: ret
    503 }
    504 
    505 define i32 @test8(<2 x i32> %x) {
    506 ; Ensure that we can promote an alloca that doesn't mention a vector type based
    507 ; on a single store with a vector type.
    508 ; CHECK-LABEL: @test8(
        ; The i64 alloca is written once as a <2 x i32> and read back as two
        ; separate i32 values; the directives expect two extractelement operations
        ; on %x (lanes 0 and 1).
    509 entry:
    510 	%a = alloca i64
    511   %a.vec = bitcast i64* %a to <2 x i32>*
    512   %a.i32 = bitcast i64* %a to i32*
    513 ; CHECK-NOT: alloca
    514 
    515   store <2 x i32> %x, <2 x i32>* %a.vec
    516 ; CHECK-NOT: store
    517 
    518   %tmp1 = load i32, i32* %a.i32
    519   %a.tmp2 = getelementptr inbounds i32, i32* %a.i32, i64 1
    520   %tmp2 = load i32, i32* %a.tmp2
    521 ; CHECK-NOT: load
    522 ; CHECK:      extractelement <2 x i32> %x, i32 0
    523 ; CHECK-NEXT: extractelement <2 x i32> %x, i32 1
    524 
    525   %tmp4 = add i32 %tmp1, %tmp2
    526   ret i32 %tmp4
    527 ; CHECK-NEXT: add
    528 ; CHECK-NEXT: ret
    529 }
    530 
    531 define <2 x i32> @test9(i32 %x, i32 %y) {
    532 ; Ensure that we can promote an alloca that doesn't mention a vector type based
    533 ; on a single load with a vector type.
    534 ; CHECK-LABEL: @test9(
        ; The i64 alloca is written as two separate i32 values and read back once
        ; as a <2 x i32>; the directives expect the vector to be built from undef
        ; with two insertelement operations (%x into lane 0, %y into lane 1).
    535 entry:
    536 	%a = alloca i64
    537   %a.vec = bitcast i64* %a to <2 x i32>*
    538   %a.i32 = bitcast i64* %a to i32*
    539 ; CHECK-NOT: alloca
    540 
    541   store i32 %x, i32* %a.i32
    542   %a.tmp2 = getelementptr inbounds i32, i32* %a.i32, i64 1
    543   store i32 %y, i32* %a.tmp2
    544 ; CHECK-NOT: store
    545 ; CHECK:      %[[V1:.*]] = insertelement <2 x i32> undef, i32 %x, i32 0
    546 ; CHECK-NEXT: %[[V2:.*]] = insertelement <2 x i32> %[[V1]], i32 %y, i32 1
    547 
    548   %result = load <2 x i32>, <2 x i32>* %a.vec
    549 ; CHECK-NOT:  load
    550 
    551   ret <2 x i32> %result
    552 ; CHECK-NEXT: ret <2 x i32> %[[V2]]
    553 }
    554 
    555 define <2 x i32> @test10(<4 x i16> %x, i32 %y) {
    556 ; If there are multiple different vector types used, we should select the one
    557 ; with the widest elements.
    558 ; CHECK-LABEL: @test10(
        ; A <4 x i16> store, an i32 store to the upper half and a <2 x i32> load all
        ; touch the same i64 alloca. The directives expect the work to be done in
        ; <2 x i32>: %x is bitcast to that type and %y is inserted into lane 1.
    559 entry:
    560 	%a = alloca i64
    561   %a.vec1 = bitcast i64* %a to <2 x i32>*
    562   %a.vec2 = bitcast i64* %a to <4 x i16>*
    563   %a.i32 = bitcast i64* %a to i32*
    564 ; CHECK-NOT: alloca
    565 
    566   store <4 x i16> %x, <4 x i16>* %a.vec2
    567   %a.tmp2 = getelementptr inbounds i32, i32* %a.i32, i64 1
    568   store i32 %y, i32* %a.tmp2
    569 ; CHECK-NOT: store
    570 ; CHECK:      %[[V1:.*]] = bitcast <4 x i16> %x to <2 x i32>
    571 ; CHECK-NEXT: %[[V2:.*]] = insertelement <2 x i32> %[[V1]], i32 %y, i32 1
    572 
    573   %result = load <2 x i32>, <2 x i32>* %a.vec1
    574 ; CHECK-NOT:  load
    575 
    576   ret <2 x i32> %result
    577 ; CHECK-NEXT: ret <2 x i32> %[[V2]]
    578 }
    579 
    580 define <2 x float> @test11(<4 x i16> %x, i32 %y) {
    581 ; If there are multiple different element types for different vector types,
    582 ; pick the integer types. This isn't really important, but seems like the best
    583 ; heuristic for making a deterministic decision.
    584 ; CHECK-LABEL: @test11(
        ; A <4 x i16> store and an i32 store to the upper half are followed by a
        ; <2 x float> load of the same i64 alloca. The directives expect the merge
        ; to happen in <4 x i16> (bitcast of %y, widening shufflevector, select)
        ; with a final bitcast of the result to <2 x float>.
    585 entry:
    586 	%a = alloca i64
    587   %a.vec1 = bitcast i64* %a to <2 x float>*
    588   %a.vec2 = bitcast i64* %a to <4 x i16>*
    589   %a.i32 = bitcast i64* %a to i32*
    590 ; CHECK-NOT: alloca
    591 
    592   store <4 x i16> %x, <4 x i16>* %a.vec2
    593   %a.tmp2 = getelementptr inbounds i32, i32* %a.i32, i64 1
    594   store i32 %y, i32* %a.tmp2
    595 ; CHECK-NOT: store
    596 ; CHECK:      %[[V1:.*]] = bitcast i32 %y to <2 x i16>
    597 ; CHECK-NEXT: %[[V2:.*]] = shufflevector <2 x i16> %[[V1]], <2 x i16> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
    598 ; CHECK-NEXT: %[[V3:.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i16> %[[V2]], <4 x i16> %x
    599 ; CHECK-NEXT: %[[V4:.*]] = bitcast <4 x i16> %[[V3]] to <2 x float>
    600 
    601   %result = load <2 x float>, <2 x float>* %a.vec1
    602 ; CHECK-NOT:  load
    603 
    604   ret <2 x float> %result
    605 ; CHECK-NEXT: ret <2 x float> %[[V4]]
    606 }
    607 
    608 define <4 x float> @test12() {
    609 ; CHECK-LABEL: @test12(
        ; The alloca is declared as a three-element vector (<3 x i32>, align 16) but
        ; is written and read through pointers bitcast to four-element vector types.
        ; The directives expect the alloca to disappear and the result to be a
        ; single bitcast of the stored undef <4 x i32> to <4 x float>.
    610   %a = alloca <3 x i32>, align 16
    611 ; CHECK-NOT: alloca
    612 
    613   %cast1 = bitcast <3 x i32>* %a to <4 x i32>*
    614   store <4 x i32> undef, <4 x i32>* %cast1, align 16
    615 ; CHECK-NOT: store
    616 
    617   %cast2 = bitcast <3 x i32>* %a to <3 x float>*
    618   %cast3 = bitcast <3 x float>* %cast2 to <4 x float>*
    619   %vec = load <4 x float>, <4 x float>* %cast3
    620 ; CHECK-NOT: load
    621 
    622 ; CHECK:      %[[ret:.*]] = bitcast <4 x i32> undef to <4 x float>
    623 ; CHECK-NEXT: ret <4 x float> %[[ret]]
    624   ret <4 x float> %vec
    625 }
    626