; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
;
; Codegen tests for the AArch64 NEON absolute-difference and absolute-value
; families: [su]abdl / [su]abd / [su]abal / [su]aba, fabd (vector and scalar),
; sqabs, sqneg, and abs, including the "high half" (…2) forms and patterns
; that should be recognized from generic IR (fsub+fabs, sub/icmp/select).

define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)

; zext + sub + abs-select + reduce-add should lower to a uabdl/uabdl2 pair.
define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabdl8h_rdx
; CHECK: uabdl2.8h
; CHECK: uabdl.8h
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  %reduced_v = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %absel)
  ret i16 %reduced_v
}

declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)

define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabdl4s_rdx
; CHECK: uabdl2.4s
; CHECK: uabdl.4s
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %reduced_v = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %absel)
  ret i32 %reduced_v
}

declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)

; NOTE(review): %h is unused; kept to avoid perturbing the test's register
; allocation. Was "; CHECK:" — upgraded to CHECK-LABEL so the uabdl checks
; below cannot match output from a different function.
define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; CHECK-LABEL: uabdl2d_rdx
; CHECK: uabdl2.2d
; CHECK: uabdl.2d
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %reduced_v = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %absel)
  ret i64 %reduced_v
}

define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone

; fabs(fsub(a, b)) should be matched to a single fabd.
define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s_from_fsub_fabs:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %sub = fsub <2 x float> %tmp1, %tmp2
  %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
  ret <2 x float> %abs
}

define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s_from_fsub_fabs:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %sub = fsub <4 x float> %tmp1, %tmp2
  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
  ret <4 x float> %abs
}

define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d_from_fsub_fabs:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %sub = fsub <2 x double> %tmp1, %tmp2
  %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
  ret <2 x double> %abs
}

declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone

define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone

define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone

define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}

define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %abs
}

declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone

define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
;CHECK: sabal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  ; NOTE(review): %tmp4.1.1 is an unused duplicate of %tmp4.1 — kept as-is to
  ; preserve the exact test input; presumably leftover from an edit. Confirm
  ; before removing.
  %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
;CHECK: uabal.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
;CHECK: uabal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
;CHECK: uabal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
;CHECK: uabal.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
;CHECK: uabal.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
;CHECK: uabal.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
;CHECK: saba.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>, <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>, <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>, <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>, <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>, <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>, <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
;CHECK: uaba.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>, <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
;CHECK: uaba.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>, <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
;CHECK: uaba.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>, <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
;CHECK: uaba.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>, <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>, <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>, <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; Scalar FABD
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
  %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
  ret float %vabd.i
}

define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
  %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
  ret double %vabd.i
}

declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone

define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
; CHECK-LABEL: fabds_from_fsub_fabs:
; CHECK: fabd s0, s0, s1
  %sub = fsub float %a, %b
  %abs = tail call float @llvm.fabs.f32(float %sub)
  ret float %abs
}

define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd_from_fsub_fabs:
; CHECK: fabd d0, d0, d1
  %sub = fsub double %a, %b
  %abs = tail call double @llvm.fabs.f64(double %sub)
  ret double %abs
}

declare float @llvm.fabs.f32(float) nounwind readnone
declare double @llvm.fabs.f64(double) nounwind readnone

; An abd of the high half against a dup'd scalar should use the "2" form
; directly rather than an ext.16b to extract the high half first.
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}

define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}

; sub/icmp/select abs idioms should lower to a single abs instruction,
; regardless of which comparison predicate spells the pattern.
define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
; CHECK-LABEL: abspattern1:
; CHECK: abs.2s
; CHECK-NEXT: ret
  %tmp1neg = sub <2 x i32> zeroinitializer, %a
  %b = icmp sge <2 x i32> %a, zeroinitializer
  %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
  ret <2 x i32> %abs
}

define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
; CHECK-LABEL: abspattern2:
; CHECK: abs.4h
; CHECK-NEXT: ret
  %tmp1neg = sub <4 x i16> zeroinitializer, %a
  %b = icmp sgt <4 x i16> %a, zeroinitializer
  %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
  ret <4 x i16> %abs
}

define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
; CHECK-LABEL: abspattern3:
; CHECK: abs.8b
; CHECK-NEXT: ret
  %tmp1neg = sub <8 x i8> zeroinitializer, %a
  %b = icmp slt <8 x i8> %a, zeroinitializer
  %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
  ret <8 x i8> %abs
}

define <4 x i32> @abspattern4(<4 x i32>
%a) nounwind { 944 ; CHECK-LABEL: abspattern4: 945 ; CHECK: abs.4s 946 ; CHECK-NEXT: ret 947 %tmp1neg = sub <4 x i32> zeroinitializer, %a 948 %b = icmp sge <4 x i32> %a, zeroinitializer 949 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg 950 ret <4 x i32> %abs 951 } 952 953 define <8 x i16> @abspattern5(<8 x i16> %a) nounwind { 954 ; CHECK-LABEL: abspattern5: 955 ; CHECK: abs.8h 956 ; CHECK-NEXT: ret 957 %tmp1neg = sub <8 x i16> zeroinitializer, %a 958 %b = icmp sgt <8 x i16> %a, zeroinitializer 959 %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg 960 ret <8 x i16> %abs 961 } 962 963 define <16 x i8> @abspattern6(<16 x i8> %a) nounwind { 964 ; CHECK-LABEL: abspattern6: 965 ; CHECK: abs.16b 966 ; CHECK-NEXT: ret 967 %tmp1neg = sub <16 x i8> zeroinitializer, %a 968 %b = icmp slt <16 x i8> %a, zeroinitializer 969 %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a 970 ret <16 x i8> %abs 971 } 972 973 define <2 x i64> @abspattern7(<2 x i64> %a) nounwind { 974 ; CHECK-LABEL: abspattern7: 975 ; CHECK: abs.2d 976 ; CHECK-NEXT: ret 977 %tmp1neg = sub <2 x i64> zeroinitializer, %a 978 %b = icmp sle <2 x i64> %a, zeroinitializer 979 %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a 980 ret <2 x i64> %abs 981 } 982