Home | History | Annotate | Download | only in X86
      1 ; RUN: opt -slp-vectorizer -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
      2 
      3 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
      4 
      5 ; #include <stdint.h>
      6 ;
      7 ; int foo(float *A, int n) {
      8 ;   float sum = 0;
      9 ;   for (intptr_t i=0; i < n; ++i) {
     10 ;     sum += 7*A[i*4  ] +
     11 ;            7*A[i*4+1] +
     12 ;            7*A[i*4+2] +
     13 ;            7*A[i*4+3];
     14 ;   }
     15 ;   return sum;
     16 ; }
     17 
     18 ; NOSTORE-LABEL: add_red
     19 ; NOSTORE: fmul <4 x float>
     20 ; NOSTORE: shufflevector <4 x float>
     21 
        ; @add_red -- IR for the C loop above. Each iteration loads A[4*i .. 4*i+3],
        ; multiplies every element by 7.0, sums the four products with a chain of
        ; 'fadd fast' and adds that partial sum to the accumulator %sum.032.
        ; Returns the final sum converted with fptosi, or 0 when %n <= 0.
        ; The directives above use the NOSTORE prefix selected by the first RUN line
        ; and expect a <4 x float> fmul and a <4 x float> shufflevector in the output.
     22 define i32 @add_red(float* %A, i32 %n) {
     23 entry:
     24   %cmp31 = icmp sgt i32 %n, 0
     25   br i1 %cmp31, label %for.body.lr.ph, label %for.end  ; n <= 0: skip the loop, result is 0
     26 
     27 for.body.lr.ph:
     28   %0 = sext i32 %n to i64  ; trip count widened to the i64 induction type
     29   br label %for.body
     30 
     31 for.body:
     32   %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]  ; i
     33   %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]  ; sum, starts at 0.0
     34   %mul = shl nsw i64 %i.033, 2  ; 4 * i
     35   %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
     36   %1 = load float, float* %arrayidx, align 4
     37   %mul2 = fmul float %1, 7.000000e+00
     38   %add28 = or i64 %mul, 1  ; low two bits of %mul are zero, so 'or' acts as + 1
     39   %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
     40   %2 = load float, float* %arrayidx4, align 4
     41   %mul5 = fmul float %2, 7.000000e+00
     42   %add6 = fadd fast float %mul2, %mul5
     43   %add829 = or i64 %mul, 2  ; 4 * i + 2
     44   %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
     45   %3 = load float, float* %arrayidx9, align 4
     46   %mul10 = fmul float %3, 7.000000e+00
     47   %add11 = fadd fast float %add6, %mul10
     48   %add1330 = or i64 %mul, 3  ; 4 * i + 3
     49   %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
     50   %4 = load float, float* %arrayidx14, align 4
     51   %mul15 = fmul float %4, 7.000000e+00
     52   %add16 = fadd fast float %add11, %mul15  ; sum of the four products of this iteration
     53   %add17 = fadd fast float %sum.032, %add16  ; sum += partial sum
     54   %inc = add nsw i64 %i.033, 1
     55   %exitcond = icmp eq i64 %inc, %0
     56   br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
     57 
     58 for.cond.for.end_crit_edge:
     59   %phitmp = fptosi float %add17 to i32  ; float -> i32 for the return value
     60   br label %for.end
     61 
     62 for.end:
     63   %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
     64   ret i32 %sum.0.lcssa
     65 }
     66 
     67 ; int foo(float * restrict A, float * restrict B, int n) {
     68 ;   float sum = 0;
     69 ;   for (intptr_t i=0; i < n; ++i) {
     70 ;     sum *= B[0]*A[i*4  ] +
     71 ;       B[1]*A[i*4+1] +
     72 ;       B[2]*A[i*4+2] +
     73 ;       B[3]*A[i*4+3];
     74 ;   }
     75 ;   return sum;
     76 ; }
     77 
     78 ; CHECK-LABEL: mul_red
     79 ; CHECK: fmul <4 x float>
     80 ; CHECK: shufflevector <4 x float>
     81 
        ; @mul_red -- IR for the C loop above. B[0..3] are loaded once in the loop
        ; preheader; each iteration forms B[k] * A[4*i + k] for k = 0..3 (plain fmul,
        ; no fast-math flags), sums the products with 'fadd fast', and multiplies the
        ; accumulator %sum.039 by that sum. Returns the accumulator converted with
        ; fptosi, or 0 when %n <= 0.
        ; NOTE(review): the three directives above use the CHECK prefix, but the RUN
        ; lines visible in this file select only the NOSTORE and STORE prefixes via
        ; --check-prefix, so FileCheck presumably never evaluates them -- confirm
        ; whether they were meant to use one of those prefixes.
     82 define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
     83 entry:
     84   %cmp38 = icmp sgt i32 %n, 0
     85   br i1 %cmp38, label %for.body.lr.ph, label %for.end  ; n <= 0: skip the loop, result is 0
     86 
     87 for.body.lr.ph:
     88   %0 = load float, float* %B, align 4  ; B[0]
     89   %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
     90   %1 = load float, float* %arrayidx4, align 4  ; B[1]
     91   %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
     92   %2 = load float, float* %arrayidx9, align 4  ; B[2]
     93   %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
     94   %3 = load float, float* %arrayidx15, align 4  ; B[3]
     95   %4 = sext i32 %n to i64  ; trip count widened to the i64 induction type
     96   br label %for.body
     97 
     98 for.body:
     99   %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]  ; i
    100   %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]  ; sum, starts at 0.0
    101   %mul = shl nsw i64 %i.040, 2  ; 4 * i
    102   %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
    103   %5 = load float, float* %arrayidx2, align 4
    104   %mul3 = fmul float %0, %5
    105   %add35 = or i64 %mul, 1  ; low two bits of %mul are zero, so 'or' acts as + 1
    106   %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
    107   %6 = load float, float* %arrayidx6, align 4
    108   %mul7 = fmul float %1, %6
    109   %add8 = fadd fast float %mul3, %mul7
    110   %add1136 = or i64 %mul, 2  ; 4 * i + 2
    111   %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
    112   %7 = load float, float* %arrayidx12, align 4
    113   %mul13 = fmul float %2, %7
    114   %add14 = fadd fast float %add8, %mul13
    115   %add1737 = or i64 %mul, 3  ; 4 * i + 3
    116   %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
    117   %8 = load float, float* %arrayidx18, align 4
    118   %mul19 = fmul float %3, %8
    119   %add20 = fadd fast float %add14, %mul19  ; sum of the four products of this iteration
    120   %mul21 = fmul float %sum.039, %add20  ; sum *= partial sum (the accumulator update is a multiply)
    121   %inc = add nsw i64 %i.040, 1
    122   %exitcond = icmp eq i64 %inc, %4
    123   br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
    124 
    125 for.cond.for.end_crit_edge:
    126   %phitmp = fptosi float %mul21 to i32  ; float -> i32 for the return value
    127   br label %for.end
    128 
    129 for.end:
    130   %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
    131   ret i32 %sum.0.lcssa
    132 }
    133 
    134 ; int foo(float * restrict A, float * restrict B, int n) {
    135 ;   float sum = 0;
    136 ;   for (intptr_t i=0; i < n; ++i) {
    137 ;     sum += B[0]*A[i*6  ] +
    138 ;            B[1]*A[i*6+1] +
    139 ;            B[2]*A[i*6+2] +
    140 ;            B[3]*A[i*6+3] +
    141 ;            B[4]*A[i*6+4] +
    142 ;            B[5]*A[i*6+5] +
    143 ;            B[6]*A[i*6+6] +
    144 ;            B[7]*A[i*6+7] +
    145 ;            B[8]*A[i*6+8];
    146 ;   }
    147 ;   return sum;
    148 ; }
    149 
    150 ; CHECK-LABEL: long_red
    151 ; CHECK: fmul fast <4 x float>
    152 ; CHECK: shufflevector <4 x float>
    153 
        ; @long_red -- IR for the C loop above. B[0..8] are loaded once in the loop
        ; preheader; each iteration forms the nine products B[k] * A[6*i + k] for
        ; k = 0..8 with 'fmul fast', sums them with a chain of 'fadd fast' and adds the
        ; result to the accumulator %sum.082. Returns the accumulator converted with
        ; fptosi, or 0 when %n <= 0.
        ; NOTE(review): the three directives above use the CHECK prefix, but the RUN
        ; lines visible in this file select only the NOSTORE and STORE prefixes via
        ; --check-prefix, so FileCheck presumably never evaluates them -- confirm
        ; whether they were meant to use one of those prefixes.
    154 define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
    155 entry:
    156   %cmp81 = icmp sgt i32 %n, 0
    157   br i1 %cmp81, label %for.body.lr.ph, label %for.end  ; n <= 0: skip the loop, result is 0
    158 
    159 for.body.lr.ph:
    160   %0 = load float, float* %B, align 4  ; B[0]
    161   %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
    162   %1 = load float, float* %arrayidx4, align 4  ; B[1]
    163   %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
    164   %2 = load float, float* %arrayidx9, align 4  ; B[2]
    165   %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
    166   %3 = load float, float* %arrayidx15, align 4  ; B[3]
    167   %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
    168   %4 = load float, float* %arrayidx21, align 4  ; B[4]
    169   %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
    170   %5 = load float, float* %arrayidx27, align 4  ; B[5]
    171   %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
    172   %6 = load float, float* %arrayidx33, align 4  ; B[6]
    173   %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
    174   %7 = load float, float* %arrayidx39, align 4  ; B[7]
    175   %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
    176   %8 = load float, float* %arrayidx45, align 4  ; B[8]
    177   %9 = sext i32 %n to i64  ; trip count widened to the i64 induction type
    178   br label %for.body
    179 
    180 for.body:
    181   %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]  ; i
    182   %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]  ; sum, starts at 0.0
    183   %mul = mul nsw i64 %i.083, 6  ; 6 * i
    184   %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
    185   %10 = load float, float* %arrayidx2, align 4
    186   %mul3 = fmul fast float %0, %10
    187   %add80 = or i64 %mul, 1  ; 6 * i is even, so 'or 1' acts as + 1; later offsets use 'add'
    188   %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
    189   %11 = load float, float* %arrayidx6, align 4
    190   %mul7 = fmul fast float %1, %11
    191   %add8 = fadd fast float %mul3, %mul7
    192   %add11 = add nsw i64 %mul, 2
    193   %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
    194   %12 = load float, float* %arrayidx12, align 4
    195   %mul13 = fmul fast float %2, %12
    196   %add14 = fadd fast float %add8, %mul13
    197   %add17 = add nsw i64 %mul, 3
    198   %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
    199   %13 = load float, float* %arrayidx18, align 4
    200   %mul19 = fmul fast float %3, %13
    201   %add20 = fadd fast float %add14, %mul19
    202   %add23 = add nsw i64 %mul, 4
    203   %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
    204   %14 = load float, float* %arrayidx24, align 4
    205   %mul25 = fmul fast float %4, %14
    206   %add26 = fadd fast float %add20, %mul25
    207   %add29 = add nsw i64 %mul, 5
    208   %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
    209   %15 = load float, float* %arrayidx30, align 4
    210   %mul31 = fmul fast float %5, %15
    211   %add32 = fadd fast float %add26, %mul31
    212   %add35 = add nsw i64 %mul, 6
    213   %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
    214   %16 = load float, float* %arrayidx36, align 4
    215   %mul37 = fmul fast float %6, %16
    216   %add38 = fadd fast float %add32, %mul37
    217   %add41 = add nsw i64 %mul, 7
    218   %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
    219   %17 = load float, float* %arrayidx42, align 4
    220   %mul43 = fmul fast float %7, %17
    221   %add44 = fadd fast float %add38, %mul43
    222   %add47 = add nsw i64 %mul, 8
    223   %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
    224   %18 = load float, float* %arrayidx48, align 4
    225   %mul49 = fmul fast float %8, %18
    226   %add50 = fadd fast float %add44, %mul49  ; sum of the nine products of this iteration
    227   %add51 = fadd fast float %sum.082, %add50  ; sum += partial sum
    228   %inc = add nsw i64 %i.083, 1
    229   %exitcond = icmp eq i64 %inc, %9
    230   br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
    231 
    232 for.cond.for.end_crit_edge:
    233   %phitmp = fptosi float %add51 to i32  ; float -> i32 for the return value
    234   br label %for.end
    235 
    236 for.end:
    237   %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
    238   ret i32 %sum.0.lcssa
    239 }
    240 
    241 ; int foo(float * restrict A, float * restrict B, int n) {
    242 ;   float sum = 0;
    243 ;   for (intptr_t i=0; i < n; ++i) {
    244 ;     sum += B[0]*A[i*4  ];
    245 ;     sum += B[1]*A[i*4+1];
    246 ;     sum += B[2]*A[i*4+2];
    247 ;     sum += B[3]*A[i*4+3];
    248 ;   }
    249 ;   return sum;
    250 ; }
    251 
    252 ; CHECK-LABEL: chain_red
    253 ; CHECK: fmul fast <4 x float>
    254 ; CHECK: shufflevector <4 x float>
    255 
        ; @chain_red -- IR for the C loop above. B[0..3] are loaded once in the loop
        ; preheader. Unlike the other reductions in this file, the four products
        ; B[k] * A[4*i + k] are not summed separately first: each one is added to the
        ; accumulator in turn (%sum.042 -> %add -> %add9 -> %add15 -> %add21), so the
        ; loop-carried value runs through the whole 'fadd fast' chain. Returns the
        ; accumulator converted with fptosi, or 0 when %n <= 0.
        ; NOTE(review): the three directives above use the CHECK prefix, but the RUN
        ; lines visible in this file select only the NOSTORE and STORE prefixes via
        ; --check-prefix, so FileCheck presumably never evaluates them -- confirm
        ; whether they were meant to use one of those prefixes.
    256 define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
    257 entry:
    258   %cmp41 = icmp sgt i32 %n, 0
    259   br i1 %cmp41, label %for.body.lr.ph, label %for.end  ; n <= 0: skip the loop, result is 0
    260 
    261 for.body.lr.ph:
    262   %0 = load float, float* %B, align 4  ; B[0]
    263   %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
    264   %1 = load float, float* %arrayidx4, align 4  ; B[1]
    265   %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
    266   %2 = load float, float* %arrayidx10, align 4  ; B[2]
    267   %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
    268   %3 = load float, float* %arrayidx16, align 4  ; B[3]
    269   %4 = sext i32 %n to i64  ; trip count widened to the i64 induction type
    270   br label %for.body
    271 
    272 for.body:
    273   %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]  ; i
    274   %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]  ; sum, starts at 0.0
    275   %mul = shl nsw i64 %i.043, 2  ; 4 * i
    276   %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
    277   %5 = load float, float* %arrayidx2, align 4
    278   %mul3 = fmul fast float %0, %5
    279   %add = fadd fast float %sum.042, %mul3  ; sum += B[0] * A[4*i]
    280   %add638 = or i64 %mul, 1  ; low two bits of %mul are zero, so 'or' acts as + 1
    281   %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
    282   %6 = load float, float* %arrayidx7, align 4
    283   %mul8 = fmul fast float %1, %6
    284   %add9 = fadd fast float %add, %mul8  ; sum += B[1] * A[4*i + 1]
    285   %add1239 = or i64 %mul, 2  ; 4 * i + 2
    286   %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
    287   %7 = load float, float* %arrayidx13, align 4
    288   %mul14 = fmul fast float %2, %7
    289   %add15 = fadd fast float %add9, %mul14  ; sum += B[2] * A[4*i + 2]
    290   %add1840 = or i64 %mul, 3  ; 4 * i + 3
    291   %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
    292   %8 = load float, float* %arrayidx19, align 4
    293   %mul20 = fmul fast float %3, %8
    294   %add21 = fadd fast float %add15, %mul20  ; sum += B[3] * A[4*i + 3]
    295   %inc = add nsw i64 %i.043, 1
    296   %exitcond = icmp eq i64 %inc, %4
    297   br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
    298 
    299 for.cond.for.end_crit_edge:
    300   %phitmp = fptosi float %add21 to i32  ; float -> i32 for the return value
    301   br label %for.end
    302 
    303 for.end:
    304   %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
    305   ret i32 %sum.0.lcssa
    306 }
    307 
    308 ; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
    309 ;   float sum = 0;
    310 ;   for (intptr_t i=0; i < n; ++i) {
    311 ;     C[i] = B[0] *A[i*4  ] +
    312 ;          B[1] *A[i*4+1] +
    313 ;          B[2] *A[i*4+2] +
    314 ;          B[3] *A[i*4+3];
    315 ;   }
    316 ;   return sum;
    317 ; }
    318 
    319 ; CHECK-LABEL: store_red
    320 ; CHECK: fmul fast <4 x float>
    321 ; CHECK: shufflevector <4 x float>
    322 
        ; @store_red -- IR for the C loop above. Each iteration sums the four products
        ; B[k] * A[4*i + k] (k = 0..3) with 'fadd fast' and stores the sum through
        ; %C.addr.038, a pointer that starts at %C and advances one float per
        ; iteration. There is no loop-carried accumulator: the sum feeds a store.
        ; Only the addresses of B[1..3] are computed in the preheader; the B loads
        ; themselves happen inside the loop body. The function always returns 0.
        ; NOTE(review): the three directives above use the CHECK prefix, but the RUN
        ; lines visible in this file select only the NOSTORE and STORE prefixes via
        ; --check-prefix, so FileCheck presumably never evaluates them -- confirm
        ; whether they were meant to use one of those prefixes.
    323 define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
    324 entry:
    325   %cmp37 = icmp sgt i32 %n, 0
    326   br i1 %cmp37, label %for.body.lr.ph, label %for.end  ; n <= 0: skip the loop
    327 
    328 for.body.lr.ph:
    329   %arrayidx4 = getelementptr inbounds float, float* %B, i64 1  ; &B[1]
    330   %arrayidx9 = getelementptr inbounds float, float* %B, i64 2  ; &B[2]
    331   %arrayidx15 = getelementptr inbounds float, float* %B, i64 3  ; &B[3]
    332   %0 = sext i32 %n to i64  ; trip count widened to the i64 induction type
    333   br label %for.body
    334 
    335 for.body:
    336   %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]  ; i
    337   %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]  ; current output slot
    338   %1 = load float, float* %B, align 4  ; B[0], reloaded every iteration
    339   %mul = shl nsw i64 %i.039, 2  ; 4 * i
    340   %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
    341   %2 = load float, float* %arrayidx2, align 4
    342   %mul3 = fmul fast float %1, %2
    343   %3 = load float, float* %arrayidx4, align 4  ; B[1]
    344   %add34 = or i64 %mul, 1  ; low two bits of %mul are zero, so 'or' acts as + 1
    345   %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
    346   %4 = load float, float* %arrayidx6, align 4
    347   %mul7 = fmul fast float %3, %4
    348   %add8 = fadd fast float %mul3, %mul7
    349   %5 = load float, float* %arrayidx9, align 4  ; B[2]
    350   %add1135 = or i64 %mul, 2  ; 4 * i + 2
    351   %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
    352   %6 = load float, float* %arrayidx12, align 4
    353   %mul13 = fmul fast float %5, %6
    354   %add14 = fadd fast float %add8, %mul13
    355   %7 = load float, float* %arrayidx15, align 4  ; B[3]
    356   %add1736 = or i64 %mul, 3  ; 4 * i + 3
    357   %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
    358   %8 = load float, float* %arrayidx18, align 4
    359   %mul19 = fmul fast float %7, %8
    360   %add20 = fadd fast float %add14, %mul19  ; sum of the four products of this iteration
    361   store float %add20, float* %C.addr.038, align 4  ; the reduction result feeds this store
    362   %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1  ; advance to the next output slot
    363   %inc = add nsw i64 %i.039, 1
    364   %exitcond = icmp eq i64 %inc, %0
    365   br i1 %exitcond, label %for.end, label %for.body
    366 
    367 for.end:
    368   ret i32 0
    369 }
    370 
    371 
    372 ; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
    373 
    374 ; void foo(double * restrict A, double * restrict B, double * restrict C,
    375 ;          int n) {
    376 ;   for (intptr_t i=0; i < n; ++i) {
    377 ;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
    378 ;   }
    379 ; }
    380 
    381 ; STORE-LABEL: store_red_double
    382 ; STORE: fmul fast <2 x double>
    383 ; STORE: extractelement <2 x double>
    384 ; STORE: extractelement <2 x double>
    385 
        ; @store_red_double -- IR for the C loop above. B[0] and B[1] are loaded once
        ; in the loop preheader; each iteration computes
        ; B[0] * A[4*i] + B[1] * A[4*i + 1] with fast-math flags and stores it to C[i].
        ; The directives above use the STORE prefix, which is selected by the RUN line
        ; that also passes -slp-vectorize-hor-store; they expect a <2 x double> fmul
        ; followed by two <2 x double> extractelement instructions in the output.
    386 define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
    387 entry:
    388   %cmp17 = icmp sgt i32 %n, 0
    389   br i1 %cmp17, label %for.body.lr.ph, label %for.end  ; n <= 0: skip the loop
    390 
    391 for.body.lr.ph:
    392   %0 = load double, double* %B, align 8  ; B[0]
    393   %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
    394   %1 = load double, double* %arrayidx4, align 8  ; B[1]
    395   %2 = sext i32 %n to i64  ; trip count widened to the i64 induction type
    396   br label %for.body
    397 
    398 for.body:
    399   %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]  ; i
    400   %mul = shl nsw i64 %i.018, 2  ; 4 * i
    401   %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
    402   %3 = load double, double* %arrayidx2, align 8
    403   %mul3 = fmul fast double %0, %3
    404   %add16 = or i64 %mul, 1  ; low two bits of %mul are zero, so 'or' acts as + 1
    405   %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
    406   %4 = load double, double* %arrayidx6, align 8
    407   %mul7 = fmul fast double %1, %4
    408   %add8 = fadd fast double %mul3, %mul7  ; two-element reduction
    409   %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018  ; &C[i]
    410   store double %add8, double* %arrayidx9, align 8  ; the reduction result feeds this store
    411   %inc = add nsw i64 %i.018, 1
    412   %exitcond = icmp eq i64 %inc, %2
    413   br i1 %exitcond, label %for.end, label %for.body
    414 
    415 for.end:
    416   ret void
    417 }
    418