; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V8a
; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V81a-apple

; Codegen test for the RDMA instructions (sqrdmlah / sqrdmlsh).
;
; Every function below builds the accumulating form out of the sqrdmulh
; intrinsic followed by the sqadd (for sqrdmlah) or sqsub (for sqrdmlsh)
; intrinsic.  Three llc runs are checked:
;   * without +v8.1a: only the separate sqrdmulh is expected;
;   * with +v8.1a, generic NEON syntax: the fused sqrdmlah/sqrdmlsh is expected;
;   * with +v8.1a, Apple NEON syntax: the same fused instruction, Apple spelling.
; Each run also enables the common label prefix, so the per-function label
; lines are honoured and every check is confined to its own function's output.

declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)

declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)

declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)

;-----------------------------------------------------------------------------
; RDMA Vector
; test for SIMDThreeSameVectorSQRDMLxHTiedHS
;
; Plain vector operands: %acc arrives in v0, %mhs in v1, %rhs in v2, which is
; why the expected instructions name those registers literally.

define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i16:
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v8i16:
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v2i32:
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i32:
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
  ret <4 x i32> %retval
}

define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i16:
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v8i16:
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v2i32:
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i32:
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
  ret <4 x i32> %retval
}

;-----------------------------------------------------------------------------
; RDMA Vector, by element
; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
;
; The second multiplicand is a splat of one lane of %v (built with
; shufflevector); the expected instructions use the indexed operand form
; with that lane number.

define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
  ret <4 x i32> %retval
}

define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1]
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0]
  ret <4 x i32> %retval
}

;-----------------------------------------------------------------------------
; RDMA Vector, by element, extracted
; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
;
; The accumulator is a scalar here, so the vector arguments %x and %v are the
; ones expected in v0 and v1.

define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
  %retval = extractelement <8 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %extract = extractelement <2 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0]
  ret i32 %retval
}

define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %extract = extractelement <4 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0]
  ret i32 %retval
}

define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
  %retval = extractelement <8 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %extract = extractelement <2 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0]
  ret i32 %retval
}

define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %extract = extractelement <4 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0]
  ret i32 %retval
}

;-----------------------------------------------------------------------------
; RDMA Scalar
; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
;
; All operands are scalars.  The v1iNN tests route them through lane 0 of
; vectors; the i32 tests call the scalar intrinsics directly.  Register
; numbers are matched with patterns rather than spelled out.

define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i16:
  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i16 %retval
}

define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i32:
  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
  %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
  %retval = extractelement <4 x i32> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i32 %retval
}


define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i16:
  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i16 %retval
}

define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i32:
  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
  %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
  %retval = extractelement <4 x i32> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i32 %retval
}

define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlah_i32:
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  ret i32 %retval
}

define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlsh_i32:
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  ret i32 %retval
}

;-----------------------------------------------------------------------------
; RDMA Scalar, by element
; i16 tests are performed via tests in above chapter, with IR in ACLE style
; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
;
; The only vector argument is the one supplying the indexed lane, so it is
; the one expected in v0.

define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlah_extract_i16:
  %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
  %retval = extractelement <4 x i16> %retval_vec, i32 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
  ret i16 %retval
}

define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_extract_i32:
  %extract = extractelement <4 x i32> %rhs, i32 3
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  ret i32 %retval
}

define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlshq_extract_i16:
  %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
  %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
  %retval = extractelement <8 x i16> %retval_vec, i32 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
  ret i16 %retval
}

define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_extract_i32:
  %extract = extractelement <4 x i32> %rhs, i32 3
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  ret i32 %retval
}