; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s

define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second, redundant @llvm.aarch64.neon.ld2 is optimized away by
; Early CSE, so only one ld2 call remains.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %1 = bitcast i32* %a to i8*
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}


define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  store i32 0, i32* %b, align 4
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)

define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}