; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=STORE

; SLP-vectorizer tests for horizontal reductions of float/double expressions.
;
; Two invocations are used:
;   * the first uses the default pass options and is verified through the
;     NOSTORE prefix (add_red);
;   * the second additionally passes -slp-vectorize-hor-store and is verified
;     through the STORE prefix (store_red_double) and through the default
;     "CHECK" prefix (mul_red, long_red, chain_red, store_red).
;
; FileCheck drops the default prefix as soon as an explicit --check-prefix is
; supplied, so the second invocation has to name it explicitly; without that
; the directives written with the default prefix are silently skipped.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; Loop-carried fadd reduction over four loads, each scaled by the constant 7.
; Expect a <4 x float> fmul followed by a shufflevector.
;
; #include <stdint.h>
;
; int foo(float *A, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += 7*A[i*4  ] +
;            7*A[i*4+1] +
;            7*A[i*4+2] +
;            7*A[i*4+3];
;   }
;   return sum;
; }

; NOSTORE-LABEL: add_red
; NOSTORE: fmul <4 x float>
; NOSTORE: shufflevector <4 x float>

define i32 @add_red(float* %A, i32 %n) {
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  ; %mul = i*4; the low two bits are zero, so the "or" below acts as an add.
  %mul = shl nsw i64 %i.033, 2
  %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
  %1 = load float, float* %arrayidx, align 4
  %mul2 = fmul float %1, 7.000000e+00
  %add28 = or i64 %mul, 1
  %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
  %2 = load float, float* %arrayidx4, align 4
  %mul5 = fmul float %2, 7.000000e+00
  %add6 = fadd fast float %mul2, %mul5
  %add829 = or i64 %mul, 2
  %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
  %3 = load float, float* %arrayidx9, align 4
  %mul10 = fmul float %3, 7.000000e+00
  %add11 = fadd fast float %add6, %mul10
  %add1330 = or i64 %mul, 3
  %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
  %4 = load float, float* %arrayidx14, align 4
  %mul15 = fmul float %4, 7.000000e+00
  %add16 = fadd fast float %add11, %mul15
  ; Accumulate the four-term sum into the loop-carried value.
  %add17 = fadd fast float %sum.032, %add16
  %inc = add nsw i64 %i.033, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add17 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; Four-term fadd chain whose result is multiplied into the loop-carried value.
; The B operands are loaded once in the preheader.
; Expect a <4 x float> fmul followed by a shufflevector.
;
; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum *= B[0]*A[i*4  ] +
;            B[1]*A[i*4+1] +
;            B[2]*A[i*4+2] +
;            B[3]*A[i*4+3];
;   }
;   return sum;
; }

; CHECK-LABEL: mul_red
; CHECK: fmul <4 x float>
; CHECK: shufflevector <4 x float>

define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
  %cmp38 = icmp sgt i32 %n, 0
  br i1 %cmp38, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  %mul = shl nsw i64 %i.040, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul float %0, %5
  %add35 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
  %6 = load float, float* %arrayidx6, align 4
  %mul7 = fmul float %1, %6
  %add8 = fadd fast float %mul3, %mul7
  %add1136 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
  %7 = load float, float* %arrayidx12, align 4
  %mul13 = fmul float %2, %7
  %add14 = fadd fast float %add8, %mul13
  %add1737 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul float %3, %8
  %add20 = fadd fast float %add14, %mul19
  ; The reduced sum feeds a multiply with the loop-carried value.
  %mul21 = fmul float %sum.039, %add20
  %inc = add nsw i64 %i.040, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %mul21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; Nine-term fadd chain (more terms than one 4-wide vector holds), accumulated
; into the loop-carried value.
; Expect a fast <4 x float> fmul followed by a shufflevector.
;
; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*6  ] +
;            B[1]*A[i*6+1] +
;            B[2]*A[i*6+2] +
;            B[3]*A[i*6+3] +
;            B[4]*A[i*6+4] +
;            B[5]*A[i*6+5] +
;            B[6]*A[i*6+6] +
;            B[7]*A[i*6+7] +
;            B[8]*A[i*6+8];
;   }
;   return sum;
; }

; CHECK-LABEL: long_red
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>

define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
  %cmp81 = icmp sgt i32 %n, 0
  br i1 %cmp81, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
  %4 = load float, float* %arrayidx21, align 4
  %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
  %5 = load float, float* %arrayidx27, align 4
  %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
  %6 = load float, float* %arrayidx33, align 4
  %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
  %7 = load float, float* %arrayidx39, align 4
  %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
  %8 = load float, float* %arrayidx45, align 4
  %9 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  ; %mul = i*6; only the first offset is formed with "or", the rest use "add".
  %mul = mul nsw i64 %i.083, 6
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %10 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %10
  %add80 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
  %11 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %1, %11
  %add8 = fadd fast float %mul3, %mul7
  %add11 = add nsw i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
  %12 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %2, %12
  %add14 = fadd fast float %add8, %mul13
  %add17 = add nsw i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
  %13 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %3, %13
  %add20 = fadd fast float %add14, %mul19
  %add23 = add nsw i64 %mul, 4
  %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
  %14 = load float, float* %arrayidx24, align 4
  %mul25 = fmul fast float %4, %14
  %add26 = fadd fast float %add20, %mul25
  %add29 = add nsw i64 %mul, 5
  %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
  %15 = load float, float* %arrayidx30, align 4
  %mul31 = fmul fast float %5, %15
  %add32 = fadd fast float %add26, %mul31
  %add35 = add nsw i64 %mul, 6
  %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
  %16 = load float, float* %arrayidx36, align 4
  %mul37 = fmul fast float %6, %16
  %add38 = fadd fast float %add32, %mul37
  %add41 = add nsw i64 %mul, 7
  %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
  %17 = load float, float* %arrayidx42, align 4
  %mul43 = fmul fast float %7, %17
  %add44 = fadd fast float %add38, %mul43
  %add47 = add nsw i64 %mul, 8
  %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
  %18 = load float, float* %arrayidx48, align 4
  %mul49 = fmul fast float %8, %18
  %add50 = fadd fast float %add44, %mul49
  %add51 = fadd fast float %sum.082, %add50
  %inc = add nsw i64 %i.083, 1
  %exitcond = icmp eq i64 %inc, %9
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add51 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; The loop-carried value is itself the first link of the fadd chain: each
; product is added to the running sum in turn.
; Expect a fast <4 x float> fmul followed by a shufflevector.
;
; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*4  ];
;     sum += B[1]*A[i*4+1];
;     sum += B[2]*A[i*4+2];
;     sum += B[3]*A[i*4+3];
;   }
;   return sum;
; }

; CHECK-LABEL: chain_red
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>

define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
  %cmp41 = icmp sgt i32 %n, 0
  br i1 %cmp41, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx10, align 4
  %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx16, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  %mul = shl nsw i64 %i.043, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %5
  ; The chain starts from the phi value rather than from a product.
  %add = fadd fast float %sum.042, %mul3
  %add638 = or i64 %mul, 1
  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
  %6 = load float, float* %arrayidx7, align 4
  %mul8 = fmul fast float %1, %6
  %add9 = fadd fast float %add, %mul8
  %add1239 = or i64 %mul, 2
  %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
  %7 = load float, float* %arrayidx13, align 4
  %mul14 = fmul fast float %2, %7
  %add15 = fadd fast float %add9, %mul14
  %add1840 = or i64 %mul, 3
  %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
  %8 = load float, float* %arrayidx19, align 4
  %mul20 = fmul fast float %3, %8
  %add21 = fadd fast float %add15, %mul20
  %inc = add nsw i64 %i.043, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; The four-term sum is stored to C on every iteration instead of being carried
; in a phi, and the B operands are reloaded inside the loop. This test is
; verified by the invocation that passes -slp-vectorize-hor-store.
; Expect a fast <4 x float> fmul followed by a shufflevector.
;
; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] +
;            B[1] *A[i*4+1] +
;            B[2] *A[i*4+2] +
;            B[3] *A[i*4+3];
;   }
;   return sum;
; }

; CHECK-LABEL: store_red
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>

define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
entry:
  %cmp37 = icmp sgt i32 %n, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %1 = load float, float* %B, align 4
  %mul = shl nsw i64 %i.039, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %2 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %1, %2
  %3 = load float, float* %arrayidx4, align 4
  %add34 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
  %4 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %3, %4
  %add8 = fadd fast float %mul3, %mul7
  %5 = load float, float* %arrayidx9, align 4
  %add1135 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
  %6 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %5, %6
  %add14 = fadd fast float %add8, %mul13
  %7 = load float, float* %arrayidx15, align 4
  %add1736 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %7, %8
  %add20 = fadd fast float %add14, %mul19
  ; The reduced value is consumed by this store.
  store float %add20, float* %C.addr.038, align 4
  %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1
  %inc = add nsw i64 %i.039, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 0
}

; Two-term double-precision sum stored to C[i]; verified by the invocation
; that passes -slp-vectorize-hor-store.
; Expect a fast <2 x double> fmul followed by two extractelement instructions.
;
; void foo(double * restrict A, double * restrict B, double * restrict C,
;          int n) {
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
;   }
; }

; STORE-LABEL: store_red_double
; STORE: fmul fast <2 x double>
; STORE: extractelement <2 x double>
; STORE: extractelement <2 x double>

define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
entry:
  %cmp17 = icmp sgt i32 %n, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load double, double* %B, align 8
  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
  %1 = load double, double* %arrayidx4, align 8
  %2 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %mul = shl nsw i64 %i.018, 2
  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
  %3 = load double, double* %arrayidx2, align 8
  %mul3 = fmul fast double %0, %3
  %add16 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
  %4 = load double, double* %arrayidx6, align 8
  %mul7 = fmul fast double %1, %4
  %add8 = fadd fast double %mul3, %mul7
  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
  store double %add8, double* %arrayidx9, align 8
  %inc = add nsw i64 %i.018, 1
  %exitcond = icmp eq i64 %inc, %2
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}