; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -S | FileCheck %s -check-prefix=WIDTH

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; Vectorization with dependence checks.
;
; The first RUN line forces a vector width of 2 and is verified with the
; default check prefix; the second forces a width of 4 and is verified with
; the WIDTH prefix.

; No plausible dependence - can be vectorized.
;  for (i = 0; i < 1024; ++i)
;    A[i] = A[i + 1] + 1;
;
; Each iteration loads the element one past the one it stores, so a store never
; feeds a load of a later iteration.

; CHECK-LABEL: @f1_vec(
; CHECK: <2 x i32>

define void @f1_vec(i32* %A) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %indvars.iv.next = add i32 %indvars.iv, 1
  %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv.next
  %0 = load i32* %arrayidx, align 4
  %add1 = add nsw i32 %0, 1
  %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv
  store i32 %add1, i32* %arrayidx3, align 4
  %exitcond = icmp ne i32 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}

; Plausible dependence of distance 1 - can't be vectorized.
;  for (i = 0; i < 1024; ++i)
;    A[i+1] = A[i] + 1;
;
; The element stored in one iteration is the element loaded by the next one.

; CHECK-LABEL: @f2_novec(
; CHECK-NOT: <2 x i32>

define void @f2_novec(i32* %A) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  %add = add nsw i32 %0, 1
  %indvars.iv.next = add i32 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv.next
  store i32 %add, i32* %arrayidx3, align 4
  %exitcond = icmp ne i32 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}

; Plausible dependence of distance 2 - can be vectorized with a width of 2.
;  for (i = 0; i < 1024; ++i)
;    A[i+2] = A[i] + 1;
;
; The run that forces a width of 2 must produce 2-element vectors; the run that
; forces a width of 4 (WIDTH prefix) must not produce 4-element vectors.

; CHECK-LABEL: @f3_vec_len(
; CHECK: <2 x i32>

; WIDTH-LABEL: @f3_vec_len(
; WIDTH-NOT: <4 x i32>

define void @f3_vec_len(i32* %A) {
entry:
  br label %for.body

for.body:
  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %idxprom = sext i32 %i.01 to i64
  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
  %0 = load i32* %arrayidx, align 4
  %add = add nsw i32 %0, 1
  %add1 = add nsw i32 %i.01, 2
  %idxprom2 = sext i32 %add1 to i64
  %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2
  store i32 %add, i32* %arrayidx3, align 4
  %inc = add nsw i32 %i.01, 1
  %cmp = icmp slt i32 %inc, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Plausible dependence of distance 1 - cannot be vectorized (without reordering
; accesses).
;   for (i = 0; i < 1024; ++i) {
;     B[i] = A[i];
;     A[i] = B[i + 1];
;   }
;
; The address of A[i] (%arrayidx) is computed once and used both for the first
; load and for the final store.

; CHECK-LABEL: @f5(
; CHECK-NOT: <2 x i32>

define void @f5(i32* %A, i32* %B) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
  store i32 %0, i32* %arrayidx2, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, 1
  %arrayidx4 = getelementptr inbounds i32* %B, i64 %indvars.iv.next
  %1 = load i32* %arrayidx4, align 4
  store i32 %1, i32* %arrayidx, align 4
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}

; Dependence through a phi node - must not vectorize.
;   for (i = 0; i < 1024; ++i) {
;     a[i+1] = tmp;
;     tmp = a[i];
;   }
;
; The value loaded from a[i] is carried by the %tmp.addr.08 phi into the store
; of the next iteration. The function itself returns undef.

; CHECK-LABEL: @f6(
; CHECK-NOT: <2 x i32>

define i32 @f6(i32* %a, i32 %tmp) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ]
  %indvars.iv.next = add nsw i64 %indvars.iv, 1
  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv.next
  store i32 %tmp.addr.08, i32* %arrayidx, align 4
  %arrayidx3 = getelementptr inbounds i32* %a, i64 %indvars.iv
  %0 = load i32* %arrayidx3, align 4
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret i32 undef
}

; Don't vectorize true loop carried dependencies that are not a multiple of the
; vector width.
; Example:
;   for (int i = ...; ++i) {
;     a[i] = a[i-3] + ...;
;   }
; It is a bad idea to vectorize this loop because store-load forwarding will not
; happen.
;

; CHECK-LABEL: @nostoreloadforward(
; CHECK-NOT: <2 x i32>

define void @nostoreloadforward(i32* %A) {
entry:
  br label %loop

; Per iteration: A[i] = A[i+4] + A[i-3], for i from 16 through 127.
loop:
  %i = phi i64 [ 16, %entry ], [ %i.next, %loop ]
  %back.idx = add nsw i64 %i, -3
  %back.ptr = getelementptr inbounds i32* %A, i64 %back.idx
  %back.val = load i32* %back.ptr, align 4
  %fwd.idx = add nsw i64 %i, 4
  %fwd.ptr = getelementptr inbounds i32* %A, i64 %fwd.idx
  %fwd.val = load i32* %fwd.ptr, align 4
  %sum = add nsw i32 %fwd.val, %back.val
  %dst.ptr = getelementptr inbounds i32* %A, i64 %i
  store i32 %sum, i32* %dst.ptr, align 4
  %i.next = add i64 %i, 1
  %i.next.trunc = trunc i64 %i.next to i32
  %continue = icmp ne i32 %i.next.trunc, 128
  br i1 %continue, label %loop, label %exit

exit:
  ret void
}

; Example:
;   for (int i = ...; ++i) {
;     a[i] = b[i];
;     c[i] = a[i-3] + ...;
;   }
; It is a bad idea to vectorize this loop because store-load forwarding will not
; happen.
;

; CHECK-LABEL: @nostoreloadforward2(
; CHECK-NOT: <2 x i32>

define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) {
entry:
  br label %loop

; Per iteration: A[i] = B[i]; C[i] = A[i-3], for i from 16 through 127.
loop:
  %i = phi i64 [ 16, %entry ], [ %i.next, %loop ]
  %b.ptr = getelementptr inbounds i32* %B, i64 %i
  %b.val = load i32* %b.ptr, align 4
  %a.ptr = getelementptr inbounds i32* %A, i64 %i
  store i32 %b.val, i32* %a.ptr, align 4
  %back.idx = add nsw i64 %i, -3
  %a.back.ptr = getelementptr inbounds i32* %A, i64 %back.idx
  %a.back.val = load i32* %a.back.ptr, align 4
  %c.ptr = getelementptr inbounds i32* %C, i64 %i
  store i32 %a.back.val, i32* %c.ptr, align 4
  %i.next = add i64 %i, 1
  %i.next.trunc = trunc i64 %i.next to i32
  %continue = icmp ne i32 %i.next.trunc, 128
  br i1 %continue, label %loop, label %exit

exit:
  ret void
}