Home | History | Annotate | Download | only in LoopVectorize
      1 ; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
        ; The command above feeds this file to opt, runs the loop vectorizer with the
        ; vector width forced to 4 (hence the <4 x i32> patterns checked below), then
        ; -dce and -instcombine, and pipes the textual IR (-S) into FileCheck.
        ; NOTE(review): -force-vector-unroll=1 presumably keeps the vectorized loop
        ; from being unrolled as well -- confirm against the opt version in use.
      2 
      3 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
      4 target triple = "x86_64-apple-macosx10.8.0"
      5 
      6 ;CHECK: @reduction_sum
      7 ;CHECK: phi <4 x i32>
      8 ;CHECK: load <4 x i32>
      9 ;CHECK: add <4 x i32>
     10 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
     11 ;CHECK: add <4 x i32>
     12 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
     13 ;CHECK: add <4 x i32>
     14 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
     15 ;CHECK: ret i32
        ; Integer add reduction: returns the (wrapping) sum over i in [0, n) of
        ; i + A[i] + B[i], or 0 when n <= 0.
        ; The checks above expect a <4 x i32> accumulator phi with vector loads and
        ; adds, followed by a horizontal reduction: lanes 2,3 are shuffled down and
        ; added, then lane 1 is shuffled down and added, then lane 0 is extracted.
     16 define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
     17   %1 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
     18   br i1 %1, label %.lr.ph, label %._crit_edge
     19 
     20 .lr.ph:                                           ; preds = %0, %.lr.ph
     21   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]  ; i: 0 on entry, then i + 1
     22   %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]  ; running sum, starts at 0
     23   %2 = getelementptr inbounds i32* %A, i64 %indvars.iv  ; address of A[i]
     24   %3 = load i32* %2, align 4  ; A[i]
     25   %4 = getelementptr inbounds i32* %B, i64 %indvars.iv  ; address of B[i]
     26   %5 = load i32* %4, align 4  ; B[i]
     27   %6 = trunc i64 %indvars.iv to i32  ; i narrowed to i32 so it can be added to the sum
     28   %7 = add i32 %sum.02, %6  ; sum + i
     29   %8 = add i32 %7, %3  ; ... + A[i]
     30   %9 = add i32 %8, %5  ; ... + B[i]; this value feeds the accumulator phi
     31   %indvars.iv.next = add i64 %indvars.iv, 1
     32   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
     33   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
     34   br i1 %exitcond, label %._crit_edge, label %.lr.ph
     35 
     36 ._crit_edge:                                      ; preds = %.lr.ph, %0
     37   %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]  ; 0 if the loop was skipped, else the final sum
     38   ret i32 %sum.0.lcssa
     39 }
     40 
     41 ;CHECK: @reduction_prod
     42 ;CHECK: phi <4 x i32>
     43 ;CHECK: load <4 x i32>
     44 ;CHECK: mul <4 x i32>
     45 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
     46 ;CHECK: mul <4 x i32>
     47 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
     48 ;CHECK: mul <4 x i32>
     49 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
     50 ;CHECK: ret i32
        ; Integer multiply reduction: the accumulator starts at 1 and is multiplied by
        ; i, A[i] and B[i] on every iteration for i in [0, n); returns 1 when n <= 0.
        ; Same structure as an add reduction, but every reduction step checked above
        ; (in-loop and horizontal) must be a <4 x i32> mul.
     51 define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
     52   %1 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
     53   br i1 %1, label %.lr.ph, label %._crit_edge
     54 
     55 .lr.ph:                                           ; preds = %0, %.lr.ph
     56   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]  ; i: 0 on entry, then i + 1
     57   %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]  ; running product, starts at 1
     58   %2 = getelementptr inbounds i32* %A, i64 %indvars.iv  ; address of A[i]
     59   %3 = load i32* %2, align 4  ; A[i]
     60   %4 = getelementptr inbounds i32* %B, i64 %indvars.iv  ; address of B[i]
     61   %5 = load i32* %4, align 4  ; B[i]
     62   %6 = trunc i64 %indvars.iv to i32  ; i narrowed to i32
     63   %7 = mul i32 %prod.02, %6  ; prod * i (i is 0 on the first iteration)
     64   %8 = mul i32 %7, %3  ; ... * A[i]
     65   %9 = mul i32 %8, %5  ; ... * B[i]; this value feeds the accumulator phi
     66   %indvars.iv.next = add i64 %indvars.iv, 1
     67   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
     68   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
     69   br i1 %exitcond, label %._crit_edge, label %.lr.ph
     70 
     71 ._crit_edge:                                      ; preds = %.lr.ph, %0
     72   %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]  ; 1 if the loop was skipped, else the final product
     73   ret i32 %prod.0.lcssa
     74 }
     75 
     76 ;CHECK: @reduction_mix
     77 ;CHECK: phi <4 x i32>
     78 ;CHECK: load <4 x i32>
     79 ;CHECK: mul nsw <4 x i32>
     80 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
     81 ;CHECK: add <4 x i32>
     82 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
     83 ;CHECK: add <4 x i32>
     84 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
     85 ;CHECK: ret i32
        ; Add reduction whose per-iteration term contains a multiply: returns the sum
        ; over i in [0, n) of i + A[i] * B[i], or 0 when n <= 0.
        ; The multiply does not involve the accumulator, so the checks above expect it
        ; to be widened to a "mul nsw <4 x i32>" while the horizontal reduction after
        ; the loop is still done with <4 x i32> adds.
     86 define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
     87   %1 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
     88   br i1 %1, label %.lr.ph, label %._crit_edge
     89 
     90 .lr.ph:                                           ; preds = %0, %.lr.ph
     91   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]  ; i: 0 on entry, then i + 1
     92   %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]  ; running sum, starts at 0
     93   %2 = getelementptr inbounds i32* %A, i64 %indvars.iv  ; address of A[i]
     94   %3 = load i32* %2, align 4  ; A[i]
     95   %4 = getelementptr inbounds i32* %B, i64 %indvars.iv  ; address of B[i]
     96   %5 = load i32* %4, align 4  ; B[i]
     97   %6 = mul nsw i32 %5, %3  ; B[i] * A[i]; independent of the accumulator
     98   %7 = trunc i64 %indvars.iv to i32  ; i narrowed to i32
     99   %8 = add i32 %sum.02, %7  ; sum + i
    100   %9 = add i32 %8, %6  ; ... + A[i] * B[i]; this value feeds the accumulator phi
    101   %indvars.iv.next = add i64 %indvars.iv, 1
    102   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
    103   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
    104   br i1 %exitcond, label %._crit_edge, label %.lr.ph
    105 
    106 ._crit_edge:                                      ; preds = %.lr.ph, %0
    107   %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]  ; 0 if the loop was skipped, else the final sum
    108   ret i32 %sum.0.lcssa
    109 }
    110 
    111 ;CHECK: @reduction_mul
    112 ;CHECK: mul <4 x i32>
    113 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    114 ;CHECK: mul <4 x i32>
    115 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    116 ;CHECK: mul <4 x i32>
    117 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
    118 ;CHECK: ret i32
        ; Multiply reduction whose per-iteration term is built from adds: the
        ; accumulator starts at 19 and is multiplied by (A[i] + i + B[i]) for each i in
        ; [0, n). The accumulator is the second operand of the mul here.
        ; The checks above expect the in-loop update and both horizontal reduction
        ; steps to be <4 x i32> muls.
        ; NOTE(review): the accumulator is named %sum.02 although it is a product, and
        ; the exit phi yields 0 (not the start value 19) when the loop is skipped. The
        ; patterns checked above do not mention that phi -- confirm this is intended.
    119 define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
    120   %1 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
    121   br i1 %1, label %.lr.ph, label %._crit_edge
    122 
    123 .lr.ph:                                           ; preds = %0, %.lr.ph
    124   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]  ; i: 0 on entry, then i + 1
    125   %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]  ; running product, starts at 19
    126   %2 = getelementptr inbounds i32* %A, i64 %indvars.iv  ; address of A[i]
    127   %3 = load i32* %2, align 4  ; A[i]
    128   %4 = getelementptr inbounds i32* %B, i64 %indvars.iv  ; address of B[i]
    129   %5 = load i32* %4, align 4  ; B[i]
    130   %6 = trunc i64 %indvars.iv to i32  ; i narrowed to i32
    131   %7 = add i32 %3, %6  ; A[i] + i
    132   %8 = add i32 %7, %5  ; ... + B[i]
    133   %9 = mul i32 %8, %sum.02  ; term * accumulator; this value feeds the accumulator phi
    134   %indvars.iv.next = add i64 %indvars.iv, 1
    135   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
    136   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
    137   br i1 %exitcond, label %._crit_edge, label %.lr.ph
    138 
    139 ._crit_edge:                                      ; preds = %.lr.ph, %0
    140   %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]  ; 0 if the loop was skipped, else the final product
    141   ret i32 %sum.0.lcssa
    142 }
    143 
    144 ;CHECK: @start_at_non_zero
    145 ;CHECK: phi <4 x i32>
    146 ;CHECK: <i32 120, i32 0, i32 0, i32 0>
    147 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    148 ;CHECK: add <4 x i32>
    149 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    150 ;CHECK: add <4 x i32>
    151 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
    152 ;CHECK: ret i32
        ; Add reduction with a non-zero start value: returns 120 plus the sum over i in
        ; [0, n) of in[i] * coeff[i], or 120 when n <= 0.
        ; The checks above expect the start value to appear in a single lane of the
        ; initial accumulator vector (<120, 0, 0, 0>), so that the horizontal add after
        ; the loop counts it exactly once.
        ; The %out parameter is never referenced in the body.
    153 define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
    154 entry:
    155   %cmp7 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
    156   br i1 %cmp7, label %for.body, label %for.end
    157 
    158 for.body:                                         ; preds = %entry, %for.body
    159   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]  ; i: 0 on entry, then i + 1
    160   %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]  ; running sum, starts at 120
    161   %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv  ; address of in[i]
    162   %0 = load i32* %arrayidx, align 4  ; in[i]
    163   %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv  ; address of coeff[i]
    164   %1 = load i32* %arrayidx2, align 4  ; coeff[i]
    165   %mul = mul nsw i32 %1, %0  ; coeff[i] * in[i]
    166   %add = add nsw i32 %mul, %sum.09  ; accumulate; this value feeds the accumulator phi
    167   %indvars.iv.next = add i64 %indvars.iv, 1
    168   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
    169   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
    170   br i1 %exitcond, label %for.end, label %for.body
    171 
    172 for.end:                                          ; preds = %for.body, %entry
    173   %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]  ; 120 if the loop was skipped, else the final sum
    174   ret i32 %sum.0.lcssa
    175 }
    176 
    177 ;CHECK: @reduction_and
    178 ;CHECK: and <4 x i32>
    179 ;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
    180 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    181 ;CHECK: and <4 x i32>
    182 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    183 ;CHECK: and <4 x i32>
    184 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
    185 ;CHECK: ret i32
        ; Bitwise AND reduction: the accumulator starts at -1 (all bits set) and is
        ; ANDed with A[i] + B[i] for each i in [0, n); returns -1 when n <= 0.
        ; The checks above expect an all-ones <4 x i32> vector (the start value in
        ; every lane) and <4 x i32> "and" operations for the in-loop update and both
        ; horizontal reduction steps.
    186 define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
    187 entry:
    188   %cmp7 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
    189   br i1 %cmp7, label %for.body, label %for.end
    190 
    191 for.body:                                         ; preds = %entry, %for.body
    192   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]  ; i: 0 on entry, then i + 1
    193   %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]  ; running AND, starts at -1
    194   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv  ; address of A[i]
    195   %0 = load i32* %arrayidx, align 4  ; A[i]
    196   %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv  ; address of B[i]
    197   %1 = load i32* %arrayidx2, align 4  ; B[i]
    198   %add = add nsw i32 %1, %0  ; B[i] + A[i]
    199   %and = and i32 %add, %result.08  ; fold the term into the accumulator
    200   %indvars.iv.next = add i64 %indvars.iv, 1
    201   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
    202   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
    203   br i1 %exitcond, label %for.end, label %for.body
    204 
    205 for.end:                                          ; preds = %for.body, %entry
    206   %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]  ; -1 if the loop was skipped, else the final value
    207   ret i32 %result.0.lcssa
    208 }
    209 
    210 ;CHECK: @reduction_or
    211 ;CHECK: or <4 x i32>
    212 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    213 ;CHECK: or <4 x i32>
    214 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    215 ;CHECK: or <4 x i32>
    216 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
    217 ;CHECK: ret i32
        ; Bitwise OR reduction: the accumulator starts at 0 and is ORed with
        ; A[i] + B[i] for each i in [0, n); returns 0 when n <= 0.
        ; The checks above expect <4 x i32> "or" operations for the in-loop update and
        ; both horizontal reduction steps.
    218 define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
    219 entry:
    220   %cmp7 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
    221   br i1 %cmp7, label %for.body, label %for.end
    222 
    223 for.body:                                         ; preds = %entry, %for.body
    224   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]  ; i: 0 on entry, then i + 1
    225   %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]  ; running OR, starts at 0
    226   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv  ; address of A[i]
    227   %0 = load i32* %arrayidx, align 4  ; A[i]
    228   %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv  ; address of B[i]
    229   %1 = load i32* %arrayidx2, align 4  ; B[i]
    230   %add = add nsw i32 %1, %0  ; B[i] + A[i]
    231   %or = or i32 %add, %result.08  ; fold the term into the accumulator
    232   %indvars.iv.next = add i64 %indvars.iv, 1
    233   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
    234   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
    235   br i1 %exitcond, label %for.end, label %for.body
    236 
    237 for.end:                                          ; preds = %for.body, %entry
    238   %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]  ; 0 if the loop was skipped, else the final value
    239   ret i32 %result.0.lcssa
    240 }
    241 
    242 ;CHECK: @reduction_xor
    243 ;CHECK: xor <4 x i32>
    244 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    245 ;CHECK: xor <4 x i32>
    246 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    247 ;CHECK: xor <4 x i32>
    248 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
    249 ;CHECK: ret i32
        ; Bitwise XOR reduction: the accumulator starts at 0 and is XORed with
        ; A[i] + B[i] for each i in [0, n); returns 0 when n <= 0.
        ; The checks above expect <4 x i32> "xor" operations for the in-loop update and
        ; both horizontal reduction steps.
    250 define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
    251 entry:
    252   %cmp7 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
    253   br i1 %cmp7, label %for.body, label %for.end
    254 
    255 for.body:                                         ; preds = %entry, %for.body
    256   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]  ; i: 0 on entry, then i + 1
    257   %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]  ; running XOR, starts at 0
    258   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv  ; address of A[i]
    259   %0 = load i32* %arrayidx, align 4  ; A[i]
    260   %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv  ; address of B[i]
    261   %1 = load i32* %arrayidx2, align 4  ; B[i]
    262   %add = add nsw i32 %1, %0  ; B[i] + A[i]
    263   %xor = xor i32 %add, %result.08  ; fold the term into the accumulator
    264   %indvars.iv.next = add i64 %indvars.iv, 1
    265   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
    266   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
    267   br i1 %exitcond, label %for.end, label %for.body
    268 
    269 for.end:                                          ; preds = %for.body, %entry
    270   %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]  ; 0 if the loop was skipped, else the final value
    271   ret i32 %result.0.lcssa
    272 }
    273 
    274 ; In this test the accumulator (%x.05) is the RHS operand of the subtraction (x = A[i] - x), so the loop is not a reduction and must not be vectorized.
    275 ;CHECK: @reduction_sub_rhs
    276 ;CHECK-NOT: phi <4 x i32>
    277 ;CHECK-NOT: sub nsw <4 x i32>
    278 ;CHECK: ret i32
        ; Negative test: the checks above require that neither a <4 x i32> phi nor a
        ; <4 x i32> "sub nsw" appears between the function name and its return.
    279 define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
    280 entry:
    281   %cmp4 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
    282   br i1 %cmp4, label %for.body, label %for.end
    283 
    284 for.body:                                         ; preds = %entry, %for.body
    285   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]  ; i: 0 on entry, then i + 1
    286   %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]  ; loop-carried value x, starts at 0
    287   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv  ; address of A[i]
    288   %0 = load i32* %arrayidx, align 4  ; A[i]
    289   %sub = sub nsw i32 %0, %x.05  ; x = A[i] - x: the carried value is the subtrahend
    290   %indvars.iv.next = add i64 %indvars.iv, 1
    291   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
    292   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
    293   br i1 %exitcond, label %for.end, label %for.body
    294 
    295 for.end:                                          ; preds = %for.body, %entry
    296   %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]  ; 0 if the loop was skipped, else the final x
    297   ret i32 %x.0.lcssa
    298 }
    299 
    300 
    301 ; In this test the reduction variable is on the LHS and we can vectorize it.
    302 ;CHECK: @reduction_sub_lhs
    303 ;CHECK: phi <4 x i32>
    304 ;CHECK: sub nsw <4 x i32>
    305 ;CHECK: ret i32
        ; Subtract reduction: x starts at 0 and A[i] is subtracted from it for each i
        ; in [0, n); returns 0 when n <= 0. The checks above expect a <4 x i32>
        ; accumulator phi and a widened "sub nsw <4 x i32>".
    306 define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
    307 entry:
    308   %cmp4 = icmp sgt i32 %n, 0  ; the loop is entered only when n > 0
    309   br i1 %cmp4, label %for.body, label %for.end
    310 
    311 for.body:                                         ; preds = %entry, %for.body
    312   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]  ; i: 0 on entry, then i + 1
    313   %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]  ; accumulator x, starts at 0
    314   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv  ; address of A[i]
    315   %0 = load i32* %arrayidx, align 4  ; A[i]
    316   %sub = sub nsw i32 %x.05, %0  ; x = x - A[i]: the accumulator is the minuend
    317   %indvars.iv.next = add i64 %indvars.iv, 1
    318   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
    319   %exitcond = icmp eq i32 %lftr.wideiv, %n  ; leave the loop once i + 1 == n
    320   br i1 %exitcond, label %for.end, label %for.body
    321 
    322 for.end:                                          ; preds = %for.body, %entry
    323   %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]  ; 0 if the loop was skipped, else the final x
    324   ret i32 %x.0.lcssa
    325 }
    326