Home | History | Annotate | Download | only in ARM
      1 ; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
      2 ; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
      3 ; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
      4 ; REQUIRES: asserts
      5 
      6 ; Test the loop vectorizer's ability to tell when SIMD is safe with respect
      7 ; to the IEEE 754 standard.
      8 ; On Linux, we only want the vectorizer to work when the -ffast-math flag is
      9 ; set, because NEON is not IEEE compliant.
     10 ; Darwin, on the other hand, doesn't support subnormals, so all optimizations
     11 ; are allowed, even without -ffast-math.
     12 
     13 ; Integer loops are always vectorizable
     14 ; CHECK: Checking a loop in "sumi"
     15 ; CHECK: We can vectorize this loop!
     16 define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { ; integer case: C[i] = B[i] * A[i] for i = 0 .. N-1
     17 entry:
     18   %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
     19   br i1 %cmp5, label %for.end, label %for.body.preheader
     20 
     21 for.body.preheader:                               ; preds = %entry
     22   br label %for.body
     23 
     24 for.body:                                         ; preds = %for.body.preheader, %for.body
     25   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop counter i, starting at 0
     26   %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06  ; &A[i]
     27   %0 = load i32, i32* %arrayidx, align 4  ; A[i]
     28   %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06  ; &B[i]
     29   %1 = load i32, i32* %arrayidx1, align 4  ; B[i]
     30   %mul = mul nsw i32 %1, %0  ; integer multiply, no floating-point op in this loop
     31   %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06  ; &C[i]
     32   store i32 %mul, i32* %arrayidx2, align 4  ; C[i] = B[i] * A[i]
     33   %inc = add nuw nsw i32 %i.06, 1  ; i + 1
     34   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
     35   br i1 %exitcond, label %for.end.loopexit, label %for.body
     36 
     37 for.end.loopexit:                                 ; preds = %for.body
     38   br label %for.end
     39 
     40 for.end:                                          ; preds = %for.end.loopexit, %entry
     41   ret void
     42 }
     43 
     44 ; Floating-point loops need fast-math to be vectorizable
     45 ; LINUX: Checking a loop in "sumf"
     46 ; LINUX: Potentially unsafe FP op prevents vectorization
     47 ; DARWIN: Checking a loop in "sumf"
     48 ; DARWIN: We can vectorize this loop!
     49 define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; float case without fast-math: C[i] = A[i] * B[i] for i = 0 .. N-1
     50 entry:
     51   %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
     52   br i1 %cmp5, label %for.end, label %for.body.preheader
     53 
     54 for.body.preheader:                               ; preds = %entry
     55   br label %for.body
     56 
     57 for.body:                                         ; preds = %for.body.preheader, %for.body
     58   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop counter i, starting at 0
     59   %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06  ; &A[i]
     60   %0 = load float, float* %arrayidx, align 4  ; A[i]
     61   %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06  ; &B[i]
     62   %1 = load float, float* %arrayidx1, align 4  ; B[i]
     63   %mul = fmul float %0, %1  ; FP multiply carrying no fast-math flags
     64   %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06  ; &C[i]
     65   store float %mul, float* %arrayidx2, align 4  ; C[i] = A[i] * B[i]
     66   %inc = add nuw nsw i32 %i.06, 1  ; i + 1
     67   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
     68   br i1 %exitcond, label %for.end.loopexit, label %for.body
     69 
     70 for.end.loopexit:                                 ; preds = %for.body
     71   br label %for.end
     72 
     73 for.end:                                          ; preds = %for.end.loopexit, %entry
     74   ret void
     75 }
     76 
     77 ; Integer loops are always vectorizable
     78 ; CHECK: Checking a loop in "redi"
     79 ; CHECK: We can vectorize this loop!
     80 define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { ; integer reduction: accumulates b[i] * a[i] over i = 0 .. N-1
     81 entry:
     82   %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
     83   br i1 %cmp5, label %for.end, label %for.body.preheader
     84 
     85 for.body.preheader:                               ; preds = %entry
     86   br label %for.body
     87 
     88 for.body:                                         ; preds = %for.body.preheader, %for.body
     89   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop counter i, starting at 0
     90   %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running accumulator; its initial value is undef
     91   %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07  ; &a[i]
     92   %0 = load i32, i32* %arrayidx, align 4  ; a[i]
     93   %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07  ; &b[i]
     94   %1 = load i32, i32* %arrayidx1, align 4  ; b[i]
     95   %mul = mul nsw i32 %1, %0  ; b[i] * a[i], integer multiply
     96   %add = add nsw i32 %mul, %Red.06  ; accumulator + product, integer add
     97   %inc = add nuw nsw i32 %i.07, 1  ; i + 1
     98   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
     99   br i1 %exitcond, label %for.end.loopexit, label %for.body
    100 
    101 for.end.loopexit:                                 ; preds = %for.body
    102   %add.lcssa = phi i32 [ %add, %for.body ]  ; single-incoming phi carrying the last %add out of the loop
    103   br label %for.end
    104 
    105 for.end:                                          ; preds = %for.end.loopexit, %entry
    106   %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]  ; undef if the loop was bypassed, else the accumulated value
    107   ret i32 %Red.0.lcssa
    108 }
    109 
    110 ; Floating-point loops need fast-math to be vectorizable
    111 ; LINUX: Checking a loop in "redf"
    112 ; LINUX: Potentially unsafe FP op prevents vectorization
    113 ; DARWIN: Checking a loop in "redf"
    114 ; DARWIN: We can vectorize this loop!
    115 define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { ; float reduction without fast-math: accumulates a[i] * b[i] over i = 0 .. N-1
    116 entry:
    117   %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
    118   br i1 %cmp5, label %for.end, label %for.body.preheader
    119 
    120 for.body.preheader:                               ; preds = %entry
    121   br label %for.body
    122 
    123 for.body:                                         ; preds = %for.body.preheader, %for.body
    124   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop counter i, starting at 0
    125   %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running accumulator; its initial value is undef
    126   %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07  ; &a[i]
    127   %0 = load float, float* %arrayidx, align 4  ; a[i]
    128   %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07  ; &b[i]
    129   %1 = load float, float* %arrayidx1, align 4  ; b[i]
    130   %mul = fmul float %0, %1  ; FP multiply carrying no fast-math flags
    131   %add = fadd float %Red.06, %mul  ; FP add into the accumulator, also without fast-math flags
    132   %inc = add nuw nsw i32 %i.07, 1  ; i + 1
    133   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
    134   br i1 %exitcond, label %for.end.loopexit, label %for.body
    135 
    136 for.end.loopexit:                                 ; preds = %for.body
    137   %add.lcssa = phi float [ %add, %for.body ]  ; single-incoming phi carrying the last %add out of the loop
    138   br label %for.end
    139 
    140 for.end:                                          ; preds = %for.end.loopexit, %entry
    141   %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]  ; undef if the loop was bypassed, else the accumulated value
    142   ret float %Red.0.lcssa
    143 }
    144 
    145 ; Make sure calls that turn into builtins are also covered
    146 ; LINUX: Checking a loop in "fabs"
    147 ; LINUX: Potentially unsafe FP op prevents vectorization
    148 ; DARWIN: Checking a loop in "fabs"
    149 ; DARWIN: We can vectorize this loop!
    150 define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; call case without fast-math: C[i] = A[i] * fabsf(B[i]) for i = 0 .. N-1
    151 entry:
    152   %cmp10 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
    153   br i1 %cmp10, label %for.end, label %for.body
    154 
    155 for.body:                                         ; preds = %entry, %for.body
    156   %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop counter i, starting at 0
    157   %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011  ; &A[i]
    158   %0 = load float, float* %arrayidx, align 4  ; A[i]
    159   %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011  ; &B[i]
    160   %1 = load float, float* %arrayidx1, align 4  ; B[i]
    161   %fabsf = tail call float @fabsf(float %1) #1  ; fabsf(B[i]); call has no fast-math flags and uses attribute group #1
    162   %conv3 = fmul float %0, %fabsf  ; A[i] * fabsf(B[i]), FP multiply without fast-math flags
    163   %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011  ; &C[i]
    164   store float %conv3, float* %arrayidx4, align 4  ; C[i] = A[i] * fabsf(B[i])
    165   %inc = add nuw nsw i32 %i.011, 1  ; i + 1
    166   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
    167   br i1 %exitcond, label %for.end, label %for.body
    168 
    169 for.end:                                          ; preds = %for.body, %entry
    170   ret void
    171 }
    172 
    173 ; Integer loops are always vectorizable
    174 ; CHECK: Checking a loop in "sumi_fast"
    175 ; CHECK: We can vectorize this loop!
    176 define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { ; integer case: C[i] = B[i] * A[i] for i = 0 .. N-1
    177 entry:
    178   %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
    179   br i1 %cmp5, label %for.end, label %for.body.preheader
    180 
    181 for.body.preheader:                               ; preds = %entry
    182   br label %for.body
    183 
    184 for.body:                                         ; preds = %for.body.preheader, %for.body
    185   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop counter i, starting at 0
    186   %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06  ; &A[i]
    187   %0 = load i32, i32* %arrayidx, align 4  ; A[i]
    188   %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06  ; &B[i]
    189   %1 = load i32, i32* %arrayidx1, align 4  ; B[i]
    190   %mul = mul nsw i32 %1, %0  ; integer multiply, no floating-point op in this loop
    191   %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06  ; &C[i]
    192   store i32 %mul, i32* %arrayidx2, align 4  ; C[i] = B[i] * A[i]
    193   %inc = add nuw nsw i32 %i.06, 1  ; i + 1
    194   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
    195   br i1 %exitcond, label %for.end.loopexit, label %for.body
    196 
    197 for.end.loopexit:                                 ; preds = %for.body
    198   br label %for.end
    199 
    200 for.end:                                          ; preds = %for.end.loopexit, %entry
    201   ret void
    202 }
    203 
    204 ; Floating-point loops can be vectorized with fast-math
    205 ; CHECK: Checking a loop in "sumf_fast"
    206 ; CHECK: We can vectorize this loop!
    207 define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; float case with fast-math: C[i] = B[i] * A[i] for i = 0 .. N-1
    208 entry:
    209   %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
    210   br i1 %cmp5, label %for.end, label %for.body.preheader
    211 
    212 for.body.preheader:                               ; preds = %entry
    213   br label %for.body
    214 
    215 for.body:                                         ; preds = %for.body.preheader, %for.body
    216   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop counter i, starting at 0
    217   %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06  ; &A[i]
    218   %0 = load float, float* %arrayidx, align 4  ; A[i]
    219   %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06  ; &B[i]
    220   %1 = load float, float* %arrayidx1, align 4  ; B[i]
    221   %mul = fmul fast float %1, %0  ; FP multiply marked with the `fast` flag
    222   %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06  ; &C[i]
    223   store float %mul, float* %arrayidx2, align 4  ; C[i] = B[i] * A[i]
    224   %inc = add nuw nsw i32 %i.06, 1  ; i + 1
    225   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
    226   br i1 %exitcond, label %for.end.loopexit, label %for.body
    227 
    228 for.end.loopexit:                                 ; preds = %for.body
    229   br label %for.end
    230 
    231 for.end:                                          ; preds = %for.end.loopexit, %entry
    232   ret void
    233 }
    234 
    235 ; Integer loops are always vectorizable
    236 ; CHECK: Checking a loop in "redi_fast"
    237 ; CHECK: We can vectorize this loop!
    238 define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { ; integer reduction: accumulates b[i] * a[i] over i = 0 .. N-1
    239 entry:
    240   %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
    241   br i1 %cmp5, label %for.end, label %for.body.preheader
    242 
    243 for.body.preheader:                               ; preds = %entry
    244   br label %for.body
    245 
    246 for.body:                                         ; preds = %for.body.preheader, %for.body
    247   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop counter i, starting at 0
    248   %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running accumulator; its initial value is undef
    249   %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07  ; &a[i]
    250   %0 = load i32, i32* %arrayidx, align 4  ; a[i]
    251   %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07  ; &b[i]
    252   %1 = load i32, i32* %arrayidx1, align 4  ; b[i]
    253   %mul = mul nsw i32 %1, %0  ; b[i] * a[i], integer multiply
    254   %add = add nsw i32 %mul, %Red.06  ; accumulator + product, integer add
    255   %inc = add nuw nsw i32 %i.07, 1  ; i + 1
    256   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
    257   br i1 %exitcond, label %for.end.loopexit, label %for.body
    258 
    259 for.end.loopexit:                                 ; preds = %for.body
    260   %add.lcssa = phi i32 [ %add, %for.body ]  ; single-incoming phi carrying the last %add out of the loop
    261   br label %for.end
    262 
    263 for.end:                                          ; preds = %for.end.loopexit, %entry
    264   %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]  ; undef if the loop was bypassed, else the accumulated value
    265   ret i32 %Red.0.lcssa
    266 }
    267 
    268 ; Floating-point loops can be vectorized with fast-math
    269 ; CHECK: Checking a loop in "redf_fast"
    270 ; CHECK: We can vectorize this loop!
    271 define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { ; float reduction with fast-math: accumulates b[i] * a[i] over i = 0 .. N-1
    272 entry:
    273   %cmp5 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
    274   br i1 %cmp5, label %for.end, label %for.body.preheader
    275 
    276 for.body.preheader:                               ; preds = %entry
    277   br label %for.body
    278 
    279 for.body:                                         ; preds = %for.body.preheader, %for.body
    280   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop counter i, starting at 0
    281   %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running accumulator; its initial value is undef
    282   %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07  ; &a[i]
    283   %0 = load float, float* %arrayidx, align 4  ; a[i]
    284   %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07  ; &b[i]
    285   %1 = load float, float* %arrayidx1, align 4  ; b[i]
    286   %mul = fmul fast float %1, %0  ; FP multiply marked with the `fast` flag
    287   %add = fadd fast float %mul, %Red.06  ; FP add into the accumulator, also marked `fast`
    288   %inc = add nuw nsw i32 %i.07, 1  ; i + 1
    289   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
    290   br i1 %exitcond, label %for.end.loopexit, label %for.body
    291 
    292 for.end.loopexit:                                 ; preds = %for.body
    293   %add.lcssa = phi float [ %add, %for.body ]  ; single-incoming phi carrying the last %add out of the loop
    294   br label %for.end
    295 
    296 for.end:                                          ; preds = %for.end.loopexit, %entry
    297   %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]  ; undef if the loop was bypassed, else the accumulated value
    298   ret float %Red.0.lcssa
    299 }
    300 
    301 ; Make sure calls that turn into builtins are also covered
    302 ; CHECK: Checking a loop in "fabs_fast"
    303 ; CHECK: We can vectorize this loop!
    304 define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; call case with fast-math: C[i] = fabsf(B[i]) * A[i] for i = 0 .. N-1
    305 entry:
    306   %cmp10 = icmp eq i32 %N, 0  ; N == 0: bypass the loop
    307   br i1 %cmp10, label %for.end, label %for.body
    308 
    309 for.body:                                         ; preds = %entry, %for.body
    310   %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop counter i, starting at 0
    311   %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011  ; &A[i]
    312   %0 = load float, float* %arrayidx, align 4  ; A[i]
    313   %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011  ; &B[i]
    314   %1 = load float, float* %arrayidx1, align 4  ; B[i]
    315   %fabsf = tail call fast float @fabsf(float %1) #2  ; fabsf(B[i]); call is marked `fast` and uses attribute group #2
    316   %conv3 = fmul fast float %fabsf, %0  ; fabsf(B[i]) * A[i], FP multiply marked `fast`
    317   %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011  ; &C[i]
    318   store float %conv3, float* %arrayidx4, align 4  ; C[i] = fabsf(B[i]) * A[i]
    319   %inc = add nuw nsw i32 %i.011, 1  ; i + 1
    320   %exitcond = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
    321   br i1 %exitcond, label %for.end, label %for.body
    322 
    323 for.end:                                          ; preds = %for.body, %entry
    324   ret void
    325 }
    326 
    327 declare float @fabsf(float)  ; external declaration of the fabsf callee used by @fabs and @fabs_fast
    328 
    329 attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } ; group on the fabsf call in @fabs: no-infs/no-nans/unsafe-fp-math are all "false"
    330 attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" } ; group on the fabsf call in @fabs_fast: no-infs/no-nans/unsafe-fp-math are all "true"
    331