; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s

; Each function below is a scalar add-reduction whose accumulator is carried
; in i32 but masked back to a narrower width on every iteration. The FileCheck
; directives expect the vectorized loop to run the reduction on vectors of the
; narrow element type and to zero-extend the extracted scalar back to i32.

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; CHECK-LABEL: @reduction_i8
;
; char reduction_i8(char *a, char *b, int n) {
;   char sum = 0;
;   for (int i = 0; i < n; ++i)
;     sum += (a[i] + b[i]);
;   return sum;
; }
;
; Expected shape: a <16 x i8> reduction phi, two <16 x i8> loads and two
; <16 x i8> adds in the vector body, then four shuffle/add steps in the
; middle block before the scalar is extracted.
;
; CHECK: vector.body:
; CHECK: phi <16 x i8>
; CHECK: load <16 x i8>
; CHECK: load <16 x i8>
; CHECK: add <16 x i8>
; CHECK: add <16 x i8>
;
; CHECK: middle.block:
; CHECK: shufflevector <16 x i8>
; CHECK: add <16 x i8>
; CHECK: shufflevector <16 x i8>
; CHECK: add <16 x i8>
; CHECK: shufflevector <16 x i8>
; CHECK: add <16 x i8>
; CHECK: shufflevector <16 x i8>
; CHECK: add <16 x i8>
; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <16 x i8>
; CHECK: zext i8 [[Rdx]] to i32
;
define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
entry:
  ; Skip the loop entirely when n <= 0; the result is then 0.
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Narrow the i32 accumulator to the i8 return type.
  %sum.next.lcssa = phi i32 [ %sum.next, %for.body ]
  %sum.trunc = trunc i32 %sum.next.lcssa to i8
  br label %for.cond.cleanup

for.cond.cleanup:
  %result = phi i8 [ %sum.trunc, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
  ret i8 %result

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
  %sum.wide = phi i32 [ %sum.next, %for.body ], [ 0, %for.body.preheader ]
  %a.addr = getelementptr inbounds i8, i8* %a, i64 %iv
  %a.val = load i8, i8* %a.addr, align 1
  %a.ext = zext i8 %a.val to i32
  %b.addr = getelementptr inbounds i8, i8* %b, i64 %iv
  %b.val = load i8, i8* %b.addr, align 1
  %b.ext = zext i8 %b.val to i32
  ; Only the low 8 bits of the running sum feed the next iteration.
  %sum.masked = and i32 %sum.wide, 255
  %partial = add nuw nsw i32 %a.ext, %sum.masked
  %sum.next = add nuw nsw i32 %partial, %b.ext
  %iv.next = add nuw nsw i64 %iv, 1
  %iv.next.trunc = trunc i64 %iv.next to i32
  %done = icmp eq i32 %iv.next.trunc, %n
  br i1 %done, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}

; CHECK-LABEL: @reduction_i16_1
;
; short reduction_i16_1(short *a, short *b, int n) {
;   short sum = 0;
;   for (int i = 0; i < n; ++i)
;     sum += (a[i] + b[i]);
;   return sum;
; }
;
; Expected shape: a <8 x i16> reduction phi, two <8 x i16> loads and two
; <8 x i16> adds in the vector body, then three shuffle/add steps in the
; middle block before the scalar is extracted.
;
; CHECK: vector.body:
; CHECK: phi <8 x i16>
; CHECK: load <8 x i16>
; CHECK: load <8 x i16>
; CHECK: add <8 x i16>
; CHECK: add <8 x i16>
;
; CHECK: middle.block:
; CHECK: shufflevector <8 x i16>
; CHECK: add <8 x i16>
; CHECK: shufflevector <8 x i16>
; CHECK: add <8 x i16>
; CHECK: shufflevector <8 x i16>
; CHECK: add <8 x i16>
; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
; CHECK: zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
entry:
  ; Skip the loop entirely when n <= 0; the result is then 0.
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Narrow the i32 accumulator to the i16 return type.
  %sum.next.lcssa = phi i32 [ %sum.next, %for.body ]
  %sum.trunc = trunc i32 %sum.next.lcssa to i16
  br label %for.cond.cleanup

for.cond.cleanup:
  %result = phi i16 [ %sum.trunc, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
  ret i16 %result

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
  %sum.wide = phi i32 [ %sum.next, %for.body ], [ 0, %for.body.preheader ]
  %a.addr = getelementptr inbounds i16, i16* %a, i64 %iv
  %a.val = load i16, i16* %a.addr, align 2
  %a.ext = zext i16 %a.val to i32
  %b.addr = getelementptr inbounds i16, i16* %b, i64 %iv
  %b.val = load i16, i16* %b.addr, align 2
  %b.ext = zext i16 %b.val to i32
  ; Only the low 16 bits of the running sum feed the next iteration.
  %sum.masked = and i32 %sum.wide, 65535
  %partial = add nuw nsw i32 %a.ext, %sum.masked
  %sum.next = add nuw nsw i32 %partial, %b.ext
  %iv.next = add nuw nsw i64 %iv, 1
  %iv.next.trunc = trunc i64 %iv.next to i32
  %done = icmp eq i32 %iv.next.trunc, %n
  br i1 %done, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}

; CHECK-LABEL: @reduction_i16_2
;
; short reduction_i16_2(char *a, char *b, int n) {
;   short sum = 0;
;   for (int i = 0; i < n; ++i)
;     sum += (a[i] + b[i]);
;   return sum;
; }
;
; Expected shape: the inputs are loaded as <8 x i8> and zero-extended to
; <8 x i16>; the reduction phi and both adds are <8 x i16>, followed by three
; shuffle/add steps in the middle block before the scalar is extracted.
;
; CHECK: vector.body:
; CHECK: phi <8 x i16>
; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
; CHECK: add <8 x i16>
; CHECK: add <8 x i16>
;
; CHECK: middle.block:
; CHECK: shufflevector <8 x i16>
; CHECK: add <8 x i16>
; CHECK: shufflevector <8 x i16>
; CHECK: add <8 x i16>
; CHECK: shufflevector <8 x i16>
; CHECK: add <8 x i16>
; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
; CHECK: zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
entry:
  ; Skip the loop entirely when n <= 0; the result is then 0.
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.for.cond.cleanup_crit_edge:
  ; Narrow the i32 accumulator to the i16 return type.
  %sum.next.lcssa = phi i32 [ %sum.next, %for.body ]
  %sum.trunc = trunc i32 %sum.next.lcssa to i16
  br label %for.cond.cleanup

for.cond.cleanup:
  %result = phi i16 [ %sum.trunc, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
  ret i16 %result

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
  %sum.wide = phi i32 [ %sum.next, %for.body ], [ 0, %for.body.preheader ]
  %a.addr = getelementptr inbounds i8, i8* %a, i64 %iv
  %a.val = load i8, i8* %a.addr, align 1
  %a.ext = zext i8 %a.val to i32
  %b.addr = getelementptr inbounds i8, i8* %b, i64 %iv
  %b.val = load i8, i8* %b.addr, align 1
  %b.ext = zext i8 %b.val to i32
  ; Only the low 16 bits of the running sum feed the next iteration.
  %sum.masked = and i32 %sum.wide, 65535
  %partial = add nuw nsw i32 %a.ext, %sum.masked
  %sum.next = add nuw nsw i32 %partial, %b.ext
  %iv.next = add nuw nsw i64 %iv, 1
  %iv.next.trunc = trunc i64 %iv.next to i32
  %done = icmp eq i32 %iv.next.trunc, %n
  br i1 %done, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
}