; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
; REQUIRES: asserts

; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
; with respect to the IEEE 754 standard.
; On Linux, we only want the vectorizer to work when the -ffast-math flag is
; set, because NEON is not IEEE compliant.
; Darwin, on the other hand, doesn't support subnormals, and all optimizations
; are allowed, even without -ffast-math.

; Integer loops are always vectorizable
; CHECK: Checking a loop in "sumi"
; CHECK: We can vectorize this loop!
define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  ; Element-wise i32 product: C[i] = B[i] * A[i] for i = 0 .. N-1.
  ; A zero trip count bypasses the loop completely.
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds i32, i32* %A, i32 %idx
  %a.val = load i32, i32* %a.addr, align 4
  %b.addr = getelementptr inbounds i32, i32* %B, i32 %idx
  %b.val = load i32, i32* %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %c.addr = getelementptr inbounds i32, i32* %C, i32 %idx
  store i32 %prod, i32* %c.addr, align 4
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  br label %exit

exit:
  ret void
}

; Floating-point loops need fast-math to be vectorizable
; LINUX: Checking a loop in "sumf"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "sumf"
; DARWIN: We can vectorize this loop!
define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  ; Element-wise float product: C[i] = A[i] * B[i] for i = 0 .. N-1.
  ; The fmul below deliberately carries no fast-math flags.
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds float, float* %A, i32 %idx
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %B, i32 %idx
  %b.val = load float, float* %b.addr, align 4
  %prod = fmul float %a.val, %b.val
  %c.addr = getelementptr inbounds float, float* %C, i32 %idx
  store float %prod, float* %c.addr, align 4
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  br label %exit

exit:
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "redi"
; CHECK: We can vectorize this loop!
define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  ; Integer reduction: accumulates b[i] * a[i] over i = 0 .. N-1.
  ; The accumulator starts from undef, and undef is also the result
  ; when N is zero.
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  %acc = phi i32 [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds i32, i32* %a, i32 %idx
  %a.val = load i32, i32* %a.addr, align 4
  %b.addr = getelementptr inbounds i32, i32* %b, i32 %idx
  %b.val = load i32, i32* %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %acc.next = add nsw i32 %prod, %acc
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  ; Single-entry phi carrying the final accumulator out of the loop.
  %acc.lcssa = phi i32 [ %acc.next, %loop ]
  br label %exit

exit:
  %result = phi i32 [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret i32 %result
}

; Floating-point loops need fast-math to be vectorizable
; LINUX: Checking a loop in "redf"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "redf"
; DARWIN: We can vectorize this loop!
define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  ; Float reduction: accumulates a[i] * b[i] over i = 0 .. N-1.
  ; Neither the fmul nor the fadd carries fast-math flags.
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  %acc = phi float [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds float, float* %a, i32 %idx
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %b, i32 %idx
  %b.val = load float, float* %b.addr, align 4
  %prod = fmul float %a.val, %b.val
  %acc.next = fadd float %acc, %prod
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  ; Single-entry phi carrying the final accumulator out of the loop.
  %acc.lcssa = phi float [ %acc.next, %loop ]
  br label %exit

exit:
  %result = phi float [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret float %result
}

; Make sure calls that turn into builtins are also covered
; LINUX: Checking a loop in "fabs"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "fabs"
; DARWIN: We can vectorize this loop!
define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  ; C[i] = A[i] * fabsf(B[i]) for i = 0 .. N-1, with no fast-math flags on
  ; either the call or the fmul. The loop is entered straight from the
  ; entry block (there is no separate preheader).
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %entry ]
  %a.addr = getelementptr inbounds float, float* %A, i32 %idx
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %B, i32 %idx
  %b.val = load float, float* %b.addr, align 4
  %b.abs = tail call float @fabsf(float %b.val) #1
  %prod = fmul float %a.val, %b.abs
  %c.addr = getelementptr inbounds float, float* %C, i32 %idx
  store float %prod, float* %c.addr, align 4
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "sumi_fast"
; CHECK: We can vectorize this loop!
define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  ; Element-wise i32 product: C[i] = B[i] * A[i] for i = 0 .. N-1.
  ; A zero trip count bypasses the loop completely.
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds i32, i32* %A, i32 %idx
  %a.val = load i32, i32* %a.addr, align 4
  %b.addr = getelementptr inbounds i32, i32* %B, i32 %idx
  %b.val = load i32, i32* %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %c.addr = getelementptr inbounds i32, i32* %C, i32 %idx
  store i32 %prod, i32* %c.addr, align 4
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  br label %exit

exit:
  ret void
}

; Floating-point loops can be vectorizable with fast-math
; CHECK: Checking a loop in "sumf_fast"
; CHECK: We can vectorize this loop!
define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  ; Element-wise float product: C[i] = B[i] * A[i] for i = 0 .. N-1.
  ; The fmul below is tagged with the `fast` flag.
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds float, float* %A, i32 %idx
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %B, i32 %idx
  %b.val = load float, float* %b.addr, align 4
  %prod = fmul fast float %b.val, %a.val
  %c.addr = getelementptr inbounds float, float* %C, i32 %idx
  store float %prod, float* %c.addr, align 4
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  br label %exit

exit:
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "redi_fast"
; CHECK: We can vectorize this loop!
define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  ; Integer reduction: accumulates b[i] * a[i] over i = 0 .. N-1.
  ; The accumulator starts from undef, and undef is also the result
  ; when N is zero.
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  %acc = phi i32 [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds i32, i32* %a, i32 %idx
  %a.val = load i32, i32* %a.addr, align 4
  %b.addr = getelementptr inbounds i32, i32* %b, i32 %idx
  %b.val = load i32, i32* %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %acc.next = add nsw i32 %prod, %acc
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  ; Single-entry phi carrying the final accumulator out of the loop.
  %acc.lcssa = phi i32 [ %acc.next, %loop ]
  br label %exit

exit:
  %result = phi i32 [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret i32 %result
}

; Floating-point loops can be vectorizable with fast-math
; CHECK: Checking a loop in "redf_fast"
; CHECK: We can vectorize this loop!
define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  ; Float reduction: accumulates b[i] * a[i] over i = 0 .. N-1.
  ; Both the fmul and the fadd are tagged with the `fast` flag.
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop.ph

loop.ph:
  br label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  %acc = phi float [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds float, float* %a, i32 %idx
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %b, i32 %idx
  %b.val = load float, float* %b.addr, align 4
  %prod = fmul fast float %b.val, %a.val
  %acc.next = fadd fast float %prod, %acc
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:
  ; Single-entry phi carrying the final accumulator out of the loop.
  %acc.lcssa = phi float [ %acc.next, %loop ]
  br label %exit

exit:
  %result = phi float [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret float %result
}

; Make sure calls that turn into builtins are also covered
; CHECK: Checking a loop in "fabs_fast"
; CHECK: We can vectorize this loop!
define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  ; C[i] = fabsf(B[i]) * A[i] for i = 0 .. N-1, with the `fast` flag on both
  ; the call and the fmul. The loop is entered straight from the entry
  ; block (there is no separate preheader).
  %is.empty = icmp eq i32 %N, 0
  br i1 %is.empty, label %exit, label %loop

loop:
  %idx = phi i32 [ %idx.next, %loop ], [ 0, %entry ]
  %a.addr = getelementptr inbounds float, float* %A, i32 %idx
  %a.val = load float, float* %a.addr, align 4
  %b.addr = getelementptr inbounds float, float* %B, i32 %idx
  %b.val = load float, float* %b.addr, align 4
  %b.abs = tail call fast float @fabsf(float %b.val) #2
  %prod = fmul fast float %b.abs, %a.val
  %c.addr = getelementptr inbounds float, float* %C, i32 %idx
  store float %prod, float* %c.addr, align 4
  %idx.next = add nuw nsw i32 %idx, 1
  %done = icmp eq i32 %idx.next, %N
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

; External libm-style routine called by the fabs and fabs_fast loops.
declare float @fabsf(float)

; Attribute group #1 has the fp-math relaxations disabled; group #2 has
; no-infs, no-nans and unsafe-fp-math enabled.
attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }