Home | History | Annotate | Download | only in LoopVectorize
      1 ; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S | FileCheck %s
      2 ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -S | FileCheck %s -check-prefix=WIDTH
      3 
      4 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
      5 
      6 ; Vectorization with dependence checks.
      7 
      8 ; No dependence that blocks vectorization (A[i + 1] is read before a later iteration overwrites it) - can be vectorized.
      9 ;  for (i = 0; i < 1024; ++i)
     10 ;    A[i] = A[i + 1] + 1;
     11 
     12 ; CHECK: f1_vec
     13 ; CHECK: <2 x i32>
     14 
     15 define void @f1_vec(i32* %A) {  ; computes A[i] = A[i + 1] + 1 for i = 0 .. 1023
     16 entry:
     17   br label %for.body
     18 
     19 for.body:
     20   %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: 0 on entry, i + 1 on the back edge
     21   %indvars.iv.next = add i32 %indvars.iv, 1  ; i + 1, used both as the load index and as the next i
     22   %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv.next  ; &A[i + 1]
     23   %0 = load i32* %arrayidx, align 4  ; reads one element ahead of the store below
     24   %add1 = add nsw i32 %0, 1
     25   %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv  ; &A[i]
     26   store i32 %add1, i32* %arrayidx3, align 4  ; A[i] = A[i + 1] + 1
     27   %exitcond = icmp ne i32 %indvars.iv.next, 1024
     28   br i1 %exitcond, label %for.body, label %for.end  ; keep looping until i + 1 == 1024
     29 
     30 for.end:
     31   ret void
     32 }
     33 
     34 ; Plausible dependence of distance 1 - can't be vectorized.
     35 ;  for (i = 0; i < 1024; ++i)
     36 ;    A[i+1] = A[i] + 1;
     37 
     38 ; CHECK: f2_novec
     39 ; CHECK-NOT: <2 x i32>
     40 
     41 define void @f2_novec(i32* %A) {  ; computes A[i + 1] = A[i] + 1 for i = 0 .. 1023
     42 entry:
     43   br label %for.body
     44 
     45 for.body:
     46   %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: 0 on entry, i + 1 on the back edge
     47   %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv  ; &A[i]
     48   %0 = load i32* %arrayidx, align 4  ; reads the element the previous iteration stored
     49   %add = add nsw i32 %0, 1
     50   %indvars.iv.next = add i32 %indvars.iv, 1  ; i + 1, used both as the store index and as the next i
     51   %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv.next  ; &A[i + 1]
     52   store i32 %add, i32* %arrayidx3, align 4  ; A[i + 1] = A[i] + 1; the next iteration loads this element
     53   %exitcond = icmp ne i32 %indvars.iv.next, 1024
     54   br i1 %exitcond, label %for.body, label %for.end  ; keep looping until i + 1 == 1024
     55 
     56 for.end:
     57   ret void
     58 }
     59 
     60 ; Plausible dependence of distance 2 - can be vectorized with a width of 2.
     61 ;  for (i = 0; i < 1024; ++i)
     62 ;    A[i+2] = A[i] + 1;
     63 
     64 ; CHECK: f3_vec_len
     65 ; CHECK: <2 x i32>
     66 
     67 ; WIDTH: f3_vec_len
     68 ; WIDTH-NOT: <4 x i32>
     69 
     70 define void @f3_vec_len(i32* %A) {  ; computes A[i + 2] = A[i] + 1 while i + 1 < 1024, starting at i = 0
     71 entry:
     72   br label %for.body
     73 
     74 for.body:
     75   %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]  ; i: 0 on entry, i + 1 on the back edge
     76   %idxprom = sext i32 %i.01 to i64  ; 32-bit i sign-extended to index the 64-bit GEP
     77   %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom  ; &A[i]
     78   %0 = load i32* %arrayidx, align 4  ; reads the element stored two iterations earlier
     79   %add = add nsw i32 %0, 1
     80   %add1 = add nsw i32 %i.01, 2  ; i + 2, the store index
     81   %idxprom2 = sext i32 %add1 to i64
     82   %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2  ; &A[i + 2]
     83   store i32 %add, i32* %arrayidx3, align 4  ; A[i + 2] = A[i] + 1
     84   %inc = add nsw i32 %i.01, 1
     85   %cmp = icmp slt i32 %inc, 1024  ; signed compare of the incremented counter against 1024
     86   br i1 %cmp, label %for.body, label %for.end
     87 
     88 for.end:
     89   ret void
     90 }
     91 
     92 ; Plausible dependence of distance 1 - cannot be vectorized (without reordering
     93 ; accesses).
     94 ;   for (i = 0; i < 1024; ++i) {
     95 ;     B[i] = A[i];
     96 ;     A[i] = B[i + 1];
     97 ;   }
     98 
     99 ; CHECK: f5
    100 ; CHECK-NOT: <2 x i32>
    101 
    102 define void @f5(i32*  %A, i32* %B) {  ; per iteration: B[i] = A[i]; A[i] = B[i + 1]
    103 entry:
    104   br label %for.body
    105 
    106 for.body:
    107   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: 0 on entry, i + 1 on the back edge
    108   %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv  ; &A[i], reused by the final store
    109   %0 = load i32* %arrayidx, align 4
    110   %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv  ; &B[i]
    111   store i32 %0, i32* %arrayidx2, align 4  ; B[i] = A[i]
    112   %indvars.iv.next = add nsw i64 %indvars.iv, 1
    113   %arrayidx4 = getelementptr inbounds i32* %B, i64 %indvars.iv.next  ; &B[i + 1], the element the next iteration's store to B overwrites
    114   %1 = load i32* %arrayidx4, align 4
    115   store i32 %1, i32* %arrayidx, align 4  ; A[i] = B[i + 1]
    116   %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; exit test is done on the low 32 bits of i + 1
    117   %exitcond = icmp ne i32 %lftr.wideiv, 1024
    118   br i1 %exitcond, label %for.body, label %for.end  ; keep looping until the truncated i + 1 == 1024
    119 
    120 for.end:
    121   ret void
    122 }
    123 
    124 ; Dependence through a phi node - must not vectorize.
    125 ;   for (i = 0; i < 1024; ++i) {
    126 ;     a[i+1] = tmp;
    127 ;     tmp = a[i];
    128 ;   }
    129 
    130 ; CHECK: f6
    131 ; CHECK-NOT: <2 x i32>
    132 
    133 define i32 @f6(i32* %a, i32 %tmp) {  ; per iteration: a[i + 1] = tmp; tmp = a[i]
    134 entry:
    135   br label %for.body
    136 
    137 for.body:
    138   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i: 0 on entry, i + 1 on the back edge
    139   %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ]  ; tmp: the argument on entry, afterwards the value loaded by the previous iteration
    140   %indvars.iv.next = add nsw i64 %indvars.iv, 1
    141   %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv.next  ; &a[i + 1]
    142   store i32 %tmp.addr.08, i32* %arrayidx, align 4  ; a[i + 1] = tmp
    143   %arrayidx3 = getelementptr inbounds i32* %a, i64 %indvars.iv  ; &a[i]
    144   %0 = load i32* %arrayidx3, align 4  ; new tmp, fed back through the phi above
    145   %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; exit test is done on the low 32 bits of i + 1
    146   %exitcond = icmp ne i32 %lftr.wideiv, 1024
    147   br i1 %exitcond, label %for.body, label %for.end  ; keep looping until the truncated i + 1 == 1024
    148 
    149 for.end:
    150   ret i32 undef  ; the returned value is undef; only the loop body matters here
    151 }
    152 
    153 ; Don't vectorize true loop-carried dependences whose distance is not a
    154 ; multiple of the vector width.
    155 ; Example:
    156 ;   for (int i = ...; ++i) {
    157 ;     a[i] = a[i-3] + ...;
    158 ; It is a bad idea to vectorize this loop because store-load forwarding will not
    159 ; happen.
    160 ;
    161 
    162 ; CHECK-LABEL: @nostoreloadforward(
    163 ; CHECK-NOT: <2 x i32>
    164 
    165 define void @nostoreloadforward(i32* %A) {  ; per iteration: A[i] = A[i + 4] + A[i - 3], with i starting at 16
    166 entry:
    167   br label %for.body
    168 
    169 for.body:
    170   %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]  ; i: 16 on entry, i + 1 on the back edge
    171   %0 = add nsw i64 %indvars.iv, -3  ; i - 3
    172   %arrayidx = getelementptr inbounds i32* %A, i64 %0  ; &A[i - 3]
    173   %1 = load i32* %arrayidx, align 4  ; reads the element stored three iterations earlier
    174   %2 = add nsw i64 %indvars.iv, 4  ; i + 4
    175   %arrayidx2 = getelementptr inbounds i32* %A, i64 %2  ; &A[i + 4]
    176   %3 = load i32* %arrayidx2, align 4
    177   %add3 = add nsw i32 %3, %1
    178   %arrayidx5 = getelementptr inbounds i32* %A, i64 %indvars.iv  ; &A[i]
    179   store i32 %add3, i32* %arrayidx5, align 4  ; A[i] = A[i + 4] + A[i - 3]
    180   %indvars.iv.next = add i64 %indvars.iv, 1
    181   %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; exit test is done on the low 32 bits of i + 1
    182   %exitcond = icmp ne i32 %lftr.wideiv, 128
    183   br i1 %exitcond, label %for.body, label %for.end  ; keep looping until the truncated i + 1 == 128
    184 
    185 for.end:
    186   ret void
    187 }
    188 
    189 ; Example:
    190 ;   for (int i = ...; ++i) {
    191 ;     a[i] = b[i];
    192 ;     c[i] = a[i-3] + ...;
    193 ; It is a bad idea to vectorize this loop because store-load forwarding will not
    194 ; happen.
    195 ;
    196 
    197 ; CHECK-LABEL: @nostoreloadforward2(
    198 ; CHECK-NOT: <2 x i32>
    199 
    200 define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) {  ; per iteration: A[i] = B[i]; C[i] = A[i - 3]; all three pointers are noalias
    201 entry:
    202   br label %for.body
    203 
    204 for.body:
    205   %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]  ; i: 16 on entry, i + 1 on the back edge
    206   %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv  ; &B[i]
    207   %0 = load i32* %arrayidx, align 4
    208   %arrayidx2 = getelementptr inbounds i32* %A, i64 %indvars.iv  ; &A[i]
    209   store i32 %0, i32* %arrayidx2, align 4  ; A[i] = B[i]
    210   %1 = add nsw i64 %indvars.iv, -3  ; i - 3
    211   %arrayidx4 = getelementptr inbounds i32* %A, i64 %1  ; &A[i - 3]
    212   %2 = load i32* %arrayidx4, align 4  ; reads the element of A stored three iterations earlier
    213   %arrayidx6 = getelementptr inbounds i32* %C, i64 %indvars.iv  ; &C[i]
    214   store i32 %2, i32* %arrayidx6, align 4  ; C[i] = A[i - 3]
    215   %indvars.iv.next = add i64 %indvars.iv, 1
    216   %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; exit test is done on the low 32 bits of i + 1
    217   %exitcond = icmp ne i32 %lftr.wideiv, 128
    218   br i1 %exitcond, label %for.body, label %for.end  ; keep looping until the truncated i + 1 == 128
    219 
    220 for.end:
    221   ret void
    222 }
    223