; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

; Widening absolute-difference tests: sabd/uabd on the low 64-bit halves
; should select sabdl/uabdl, and on the high halves (extracted with a
; shufflevector of lanes [n/2, n)) should select the ...l2 forms.

define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; Open-coded abs-diff (zext, sub, select-of-negate) followed by a log2
; shuffle reduction: both the high-half (uabdl2.*) and low-half (uabdl.*)
; widening forms should be selected from the plain IR.

define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
; CHECK-LABEL: uabdl8h_log2_shuffle:
; CHECK: uabdl2.8h
; CHECK: uabdl.8h
  %aload = load <16 x i8>, <16 x i8>* %a, align 1
  %bload = load <16 x i8>, <16 x i8>* %b, align 1
  %aext = zext <16 x i8> %aload to <16 x i16>
  %bext = zext <16 x i8> %bload to <16 x i16>
  %abdiff = sub nsw <16 x i16> %aext, %bext
  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
  %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
  %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
  %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
  %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
  %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
  ret i16 %reduced_v
}

define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
; CHECK-LABEL: uabdl4s_log2_shuffle:
; CHECK: uabdl2.4s
; CHECK: uabdl.4s
  %aload = load <8 x i16>, <8 x i16>* %a, align 1
  %bload = load <8 x i16>, <8 x i16>* %b, align 1
  %aext = zext <8 x i16> %aload to <8 x i32>
  %bext = zext <8 x i16> %bload to <8 x i32>
  %abdiff = sub nsw <8 x i32> %aext, %bext
  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
  %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %absel, %rdx.shuf
  %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
  %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
  %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
  ret i32 %reduced_v
}

; NOTE(review): %h is unused; kept so the signature (and any generated
; checks keyed to it) stays stable.
define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
; Was a plain "; CHECK:"; without CHECK-LABEL the uabdl checks below were
; not scoped to this function and could match neighboring output.
; CHECK-LABEL: uabdl2d_log2_shuffle:
; CHECK: uabdl2.2d
; CHECK: uabdl.2d
  %aload = load <4 x i32>, <4 x i32>* %a, align 1
  %bload = load <4 x i32>, <4 x i32>* %b, align 1
  %aext = zext <4 x i32> %aload to <4 x i64>
  %bext = zext <4 x i32> %bload to <4 x i64>
  %abdiff = sub nsw <4 x i64> %aext, %bext
  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
  %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
  %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
  %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
  ret i64 %reduced_v
}

; Vector FABD (floating-point absolute difference) via intrinsic.

define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone

; Non-widening signed absolute difference (sabd) intrinsic selection.

define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; Non-widening unsigned absolute difference (uabd) intrinsic selection.

define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; Saturating absolute value (sqabs) intrinsic selection.

define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone

; Saturating negate (sqneg) intrinsic selection.

define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone

; Integer absolute value (abs) intrinsic selection, vector and scalar.

define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}

define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %abs
}

declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone

; Widening absolute difference and accumulate: abd + zext + add should
; select sabal/uabal (and the ...2 forms for high-half inputs).

define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
;CHECK: sabal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
;CHECK: uabal.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
;CHECK: uabal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
;CHECK: uabal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
;CHECK: uabal2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
;CHECK: uabal2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
;CHECK: uabal2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

; Non-widening absolute difference and accumulate: abd + add should
; select saba/uaba.

define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
;CHECK: saba.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>, <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>, <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>, <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>, <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>, <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>, <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
;CHECK: uaba.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>, <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
;CHECK: uaba.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>, <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
;CHECK: uaba.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>, <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
;CHECK: uaba.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>, <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>, <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>, <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; Scalar FABD
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
  %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
  ret float %vabd.i
}

define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
  %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
  ret double %vabd.i
}

declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone

; High-half extract combined with a dup'd scalar operand: the abdl2 form
; should be used directly, with no extra ext.16b to move the high half.

define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}

define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}

; Open-coded abs idioms (sub/icmp/select) should lower to a single abs
; instruction; each variant uses a different compare predicate or select
; operand order.

define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
; CHECK-LABEL: abspattern1:
; CHECK: abs.2s
; CHECK-NEXT: ret
  %tmp1neg = sub <2 x i32> zeroinitializer, %a
  %b = icmp sge <2 x i32> %a, zeroinitializer
  %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
  ret <2 x i32> %abs
}

define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
; CHECK-LABEL: abspattern2:
; CHECK: abs.4h
; CHECK-NEXT: ret
  %tmp1neg = sub <4 x i16> zeroinitializer, %a
  %b = icmp sgt <4 x i16> %a, zeroinitializer
  %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
  ret <4 x i16> %abs
}

define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
; CHECK-LABEL: abspattern3:
; CHECK: abs.8b
; CHECK-NEXT: ret
  %tmp1neg = sub <8 x i8> zeroinitializer, %a
  %b = icmp slt <8 x i8> %a, zeroinitializer
  %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
  ret <8 x i8> %abs
}

; NOTE(review): truncated at the end of this chunk — the remainder of
; abspattern4 is outside the visible source.
define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
; CHECK-LABEL: abspattern4:
; CHECK: abs.4s
; CHECK-NEXT: ret
  %tmp1neg = sub <4 x i32>
zeroinitializer, %a 907 %b = icmp sge <4 x i32> %a, zeroinitializer 908 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg 909 ret <4 x i32> %abs 910 } 911 912 define <8 x i16> @abspattern5(<8 x i16> %a) nounwind { 913 ; CHECK-LABEL: abspattern5: 914 ; CHECK: abs.8h 915 ; CHECK-NEXT: ret 916 %tmp1neg = sub <8 x i16> zeroinitializer, %a 917 %b = icmp sgt <8 x i16> %a, zeroinitializer 918 %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg 919 ret <8 x i16> %abs 920 } 921 922 define <16 x i8> @abspattern6(<16 x i8> %a) nounwind { 923 ; CHECK-LABEL: abspattern6: 924 ; CHECK: abs.16b 925 ; CHECK-NEXT: ret 926 %tmp1neg = sub <16 x i8> zeroinitializer, %a 927 %b = icmp slt <16 x i8> %a, zeroinitializer 928 %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a 929 ret <16 x i8> %abs 930 } 931 932 define <2 x i64> @abspattern7(<2 x i64> %a) nounwind { 933 ; CHECK-LABEL: abspattern7: 934 ; CHECK: abs.2d 935 ; CHECK-NEXT: ret 936 %tmp1neg = sub <2 x i64> zeroinitializer, %a 937 %b = icmp sle <2 x i64> %a, zeroinitializer 938 %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a 939 ret <2 x i64> %abs 940 } 941