; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s

; Code-generation tests for the AArch64 NEON vector add family.
; Every function below is compiled by the llc command above and the emitted
; assembly (Apple NEON syntax, e.g. "addhn.8b") is matched by FileCheck.
; The file is organised in groups; each group has a short comment describing
; the IR pattern it feeds to the backend and the instruction it expects.

; --- addhn intrinsics ------------------------------------------------------
; Direct calls to llvm.aarch64.neon.addhn.* on operands loaded from memory.

define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn8b:
;CHECK: addhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn4h:
;CHECK: addhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2s:
;CHECK: addhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

; Two narrowed results are concatenated with a shufflevector; the second half
; is expected to be written by the "2" form immediately after the first.

define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: addhn2_16b:
;CHECK: addhn.8b
;CHECK-NEXT: addhn2.16b
  %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: addhn2_8h:
;CHECK: addhn.4h
;CHECK-NEXT: addhn2.8h
  %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: addhn2_4s:
;CHECK: addhn.2s
;CHECK-NEXT: addhn2.4s
  %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone


; --- raddhn intrinsics -----------------------------------------------------
; Same shapes as the addhn group, using llvm.aarch64.neon.raddhn.*.

define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: raddhn8b:
;CHECK: raddhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i8> %tmp3
}

define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: raddhn4h:
;CHECK: raddhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i16> %tmp3
}

define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: raddhn2s:
;CHECK: raddhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i32> %tmp3
}

define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: raddhn2_16b:
;CHECK: raddhn.8b
;CHECK-NEXT: raddhn2.16b
  %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: raddhn2_8h:
;CHECK: raddhn.4h
;CHECK-NEXT: raddhn2.8h
  %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: raddhn2_4s:
;CHECK: raddhn.2s
;CHECK-NEXT: raddhn2.4s
  %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone

; --- saddl / saddl2 --------------------------------------------------------
; sext of both narrow operands followed by an add is expected to select saddl.

define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: saddl8h:
;CHECK: saddl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: saddl4s:
;CHECK: saddl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: saddl2d:
;CHECK: saddl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; The high 64 bits of each 128-bit argument are taken as element 1 of its
; <2 x i64> view, sign-extended and added. The whole function body must be a
; single saddl2 on v0/v1 followed by ret.

define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: saddl2_8h:
; CHECK-NEXT: saddl2.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
  %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
  %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <8 x i16> %add.i
}

define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: saddl2_4s:
; CHECK-NEXT: saddl2.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <8 x i16> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
  %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
  %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <4 x i32> %add.i
}

define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: saddl2_2d:
; CHECK-NEXT: saddl2.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <4 x i32> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
  %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
  %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <2 x i64> %add.i
}

; --- uaddl / uaddl2 --------------------------------------------------------
; Unsigned counterparts of the group above: zext instead of sext.

define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uaddl8h:
;CHECK: uaddl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uaddl4s:
;CHECK: uaddl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uaddl2d:
;CHECK: uaddl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}


define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: uaddl2_8h:
; CHECK-NEXT: uaddl2.8h v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
  %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
  %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <8 x i16> %add.i
}

define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: uaddl2_4s:
; CHECK-NEXT: uaddl2.4s v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <8 x i16> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
  %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp2 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
  %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <4 x i32> %add.i
}

define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: uaddl2_2d:
; CHECK-NEXT: uaddl2.2d v0, v0, v1
; CHECK-NEXT: ret
  %tmp = bitcast <4 x i32> %a to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
  %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp2 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
  %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
  ret <2 x i64> %add.i
}

; --- uaddw -----------------------------------------------------------------
; A wide operand added to the zero-extension of a narrow operand.

define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uaddw8h:
;CHECK: uaddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uaddw4s:
;CHECK: uaddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uaddw2d:
;CHECK: uaddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

; Here the narrow operand is the high half of a loaded 128-bit vector.
; NOTE(review): these functions are named *2_* but expect the plain
; (non-"2") mnemonic; presumably the high half is loaded directly from memory
; so no "2" form is needed - confirm against current llc output.

define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uaddw2_8h:
;CHECK: uaddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = zext <8 x i8> %high2 to <8 x i16>

  %res = add <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uaddw2_4s:
;CHECK: uaddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = zext <4 x i16> %high2 to <4 x i32>

  %res = add <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uaddw2_2d:
;CHECK: uaddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = zext <2 x i32> %high2 to <2 x i64>

  %res = add <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}

; --- saddw -----------------------------------------------------------------
; Signed counterparts of the uaddw group: sext instead of zext.

define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: saddw8h:
;CHECK: saddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp4 = add <8 x i16> %tmp1, %tmp3
  ret <8 x i16> %tmp4
}

define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: saddw4s:
;CHECK: saddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp4 = add <4 x i32> %tmp1, %tmp3
  ret <4 x i32> %tmp4
}

define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: saddw2d:
;CHECK: saddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp4 = add <2 x i64> %tmp1, %tmp3
  ret <2 x i64> %tmp4
}

; High-half variants; like their unsigned counterparts they expect the plain
; (non-"2") mnemonic.

define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: saddw2_8h:
;CHECK: saddw.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A

  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext2 = sext <8 x i8> %high2 to <8 x i16>

  %res = add <8 x i16> %tmp1, %ext2
  ret <8 x i16> %res
}

define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: saddw2_4s:
;CHECK: saddw.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A

  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext2 = sext <4 x i16> %high2 to <4 x i32>

  %res = add <4 x i32> %tmp1, %ext2
  ret <4 x i32> %res
}

define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: saddw2_2d:
;CHECK: saddw.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A

  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %ext2 = sext <2 x i32> %high2 to <2 x i64>

  %res = add <2 x i64> %tmp1, %ext2
  ret <2 x i64> %res
}

; --- saddlp intrinsics -----------------------------------------------------
; Direct calls to llvm.aarch64.neon.saddlp.* on a single loaded operand.

define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp4h:
;CHECK: saddlp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp3
}

define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp2s:
;CHECK: saddlp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp3
}

define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp1d:
;CHECK: saddlp.1d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp3
}

define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp8h:
;CHECK: saddlp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp3
}

define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp4s:
;CHECK: saddlp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp3
}

define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp2d:
;CHECK: saddlp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone

; --- uaddlp intrinsics -----------------------------------------------------
; Direct calls to llvm.aarch64.neon.uaddlp.* on a single loaded operand.

define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp4h:
;CHECK: uaddlp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp3
}

define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp2s:
;CHECK: uaddlp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp3
}

define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp1d:
;CHECK: uaddlp.1d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp3
}

define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp8h:
;CHECK: uaddlp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp3
}

define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp4s:
;CHECK: uaddlp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp3
}

define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp2d:
;CHECK: uaddlp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone

; --- sadalp ----------------------------------------------------------------
; The result of a saddlp intrinsic call added to an accumulator loaded from
; %B is expected to select the accumulating sadalp form.

define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp4h:
;CHECK: sadalp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
  %tmp4 = load <4 x i16>, <4 x i16>* %B
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp2s:
;CHECK: sadalp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
  %tmp4 = load <2 x i32>, <2 x i32>* %B
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp8h:
;CHECK: sadalp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
  %tmp4 = load <8 x i16>, <8 x i16>* %B
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp4s:
;CHECK: sadalp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
  %tmp4 = load <4 x i32>, <4 x i32>* %B
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: sadalp2d:
;CHECK: sadalp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
  %tmp4 = load <2 x i64>, <2 x i64>* %B
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; --- uadalp ----------------------------------------------------------------
; Unsigned counterparts: uaddlp intrinsic plus accumulator -> uadalp.

define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp4h:
;CHECK: uadalp.4h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
  %tmp4 = load <4 x i16>, <4 x i16>* %B
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp2s:
;CHECK: uadalp.2s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
  %tmp4 = load <2 x i32>, <2 x i32>* %B
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp8h:
;CHECK: uadalp.8h
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
  %tmp4 = load <8 x i16>, <8 x i16>* %B
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp4s:
;CHECK: uadalp.4s
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
  %tmp4 = load <4 x i32>, <4 x i32>* %B
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: uadalp2d:
;CHECK: uadalp.2d
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
  %tmp4 = load <2 x i64>, <2 x i64>* %B
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; --- addp intrinsics (integer) ---------------------------------------------
; Direct calls to llvm.aarch64.neon.addp.* for every integer vector shape.

define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: addp_8b:
;CHECK: addp.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: addp_16b:
;CHECK: addp.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: addp_4h:
;CHECK: addp.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addp_8h:
;CHECK: addp.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: addp_2s:
;CHECK: addp.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addp_4s:
;CHECK: addp.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addp_2d:
;CHECK: addp.2d
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
  ret <2 x i64> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

; --- addp intrinsics (floating point) --------------------------------------
; The same addp intrinsic on float/double vectors is expected to select faddp.

define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: faddp_2s:
;CHECK: faddp.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: faddp_4s:
;CHECK: faddp.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: faddp_2d:
;CHECK: faddp.2d
  %tmp1 = load <2 x double>, <2 x double>* %A
  %tmp2 = load <2 x double>, <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone

; --- long add/sub with one splatted scalar operand -------------------------
; One operand is a scalar inserted into both lanes of a <2 x i32>; the other
; is the high half (elements 2 and 3) of a <4 x i32>. The "2" form must be
; selected and no ext.16b may appear before it.

define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl2_duprhs:
; CHECK-NOT: ext.16b
; CHECK: uaddl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: saddl2.2d
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>

  %res = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl2_duprhs:
; CHECK-NOT: ext.16b
; CHECK: usubl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl2.2d
  %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1

  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>

  %res = sub <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %res
}

; --- addhn from generic IR -------------------------------------------------
; No intrinsic is used here: an add, a logical shift right by half the element
; width and a trunc are expected to be matched to addhn.

define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn8b_natural:
;CHECK: addhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %sum = add <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  ret <8 x i8> %narrowed
}

define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn4h_natural:
;CHECK: addhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %sum = add <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  ret <4 x i16> %narrowed
}

define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2s_natural:
;CHECK: addhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %sum = add <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  ret <2 x i32> %narrowed
}

; The narrowed result is concatenated after an existing low half (%low); the
; "2" form is expected.

define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn2_16b_natural:
;CHECK: addhn2.16b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %sum = add <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: addhn2_8h_natural:
;CHECK: addhn2.8h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %sum = add <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: addhn2_4s_natural:
;CHECK: addhn2.4s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %sum = add <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}

; --- subhn from generic IR -------------------------------------------------
; Same patterns as the addhn "natural" group with sub in place of add.

define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn8b_natural:
;CHECK: subhn.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %diff = sub <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  ret <8 x i8> %narrowed
}

define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn4h_natural:
;CHECK: subhn.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %diff = sub <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  ret <4 x i16> %narrowed
}

define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2s_natural:
;CHECK: subhn.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %diff = sub <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  ret <2 x i32> %narrowed
}

define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn2_16b_natural:
;CHECK: subhn2.16b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %diff = sub <8 x i16> %tmp1, %tmp2
  %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
  %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %res
}

define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: subhn2_8h_natural:
;CHECK: subhn2.8h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %diff = sub <4 x i32> %tmp1, %tmp2
  %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
  %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %res
}

define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: subhn2_4s_natural:
;CHECK: subhn2.4s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %diff = sub <2 x i64> %tmp1, %tmp2
  %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
  %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}