; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast \
; RUN:     < %s -verify-machineinstrs -asm-verbose=false | FileCheck %s

; Code generation tests for the AArch64 NEON "high half by scalar" patterns.
;
; Every integer test below has the same shape: a shufflevector selects the
; upper half of a 128-bit input, the second multiplicand is a splat (built
; either from a scalar argument with a chain of insertelement instructions or
; written as a constant vector), and the two are fed to a widening multiply
; intrinsic. The expected assembly is a single dup/movi/mvni that materialises
; the splat in a full 128-bit register, followed directly by the "2" form of
; the instruction and a ret.

;===-------------------------------------------------------------------------===
; Widening multiply: smull2 / umull2
;===-------------------------------------------------------------------------===

define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
  %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  ret <4 x i32> %vmull15.i.i
}

define <4 x i32> @test_vmull_high_n_s16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  ret <4 x i32> %vmull15.i.i
}

define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
  %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  ret <2 x i64> %vmull9.i.i
}

; The splat constant 511 is 0x1ff, i.e. #1 shifted left by 8 with ones shifted
; in, which is what the "msl #8" form of movi is expected to encode.
define <2 x i64> @test_vmull_high_n_s32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 511, i32 511>)
  ret <2 x i64> %vmull9.i.i
}

define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_u16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
  %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  ret <4 x i32> %vmull15.i.i
}

; The splat constant 4352 is 0x1100, i.e. #17 (0x11) shifted left by 8.
define <4 x i32> @test_vmull_high_n_u16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_u16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 4352, i16 4352, i16 4352, i16 4352>)
  ret <4 x i32> %vmull15.i.i
}

define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_u32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
  %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  ret <2 x i64> %vmull9.i.i
}

; The splat constant 4294966784 is 0xfffffe00, the bitwise complement of
; 0x1ff, hence the inverted (mvni) form with "#1, msl #8".
define <2 x i64> @test_vmull_high_n_u32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_u32_imm:
; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 4294966784, i32 4294966784>)
  ret <2 x i64> %vmull9.i.i
}

;===-------------------------------------------------------------------------===
; Saturating doubling widening multiply: sqdmull2
;===-------------------------------------------------------------------------===

define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
  %vqdmull15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  ret <4 x i32> %vqdmull15.i.i
}

; The splat constant 61183 is 0xeeff, the 16-bit complement of 0x1100, hence
; the inverted (mvni) form with "#17, lsl #8".
define <4 x i32> @test_vqdmull_high_n_s16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s16_imm:
; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vqdmull15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 61183, i16 61183, i16 61183, i16 61183>)
  ret <4 x i32> %vqdmull15.i.i
}

define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
  %vqdmull9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  ret <2 x i64> %vqdmull9.i.i
}

define <2 x i64> @test_vqdmull_high_n_s32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vqdmull9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  ret <2 x i64> %vqdmull9.i.i
}

;===-------------------------------------------------------------------------===
; Widening multiply-accumulate: smlal2 / umlal2
; The IR is a widening multiply followed by a plain vector add with %a; the
; pair is expected to be selected as a single accumulate instruction.
;===-------------------------------------------------------------------------===

define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

define <4 x i32> @test_vmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}

define <2 x i64> @test_vmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}

define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_u16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

define <4 x i32> @test_vmlal_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_u16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_u32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}

define <2 x i64> @test_vmlal_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_u32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}

;===-------------------------------------------------------------------------===
; Saturating doubling multiply-accumulate: sqdmlal2
; The IR is an sqdmull intrinsic call whose result is passed, together with
; %a, to the sqadd intrinsic.
;===-------------------------------------------------------------------------===

define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vqdmlal15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %vqdmlal17.i.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
  ret <4 x i32> %vqdmlal17.i.i
}

define <4 x i32> @test_vqdmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vqdmlal15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %vqdmlal17.i.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
  ret <4 x i32> %vqdmlal17.i.i
}

define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vqdmlal9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %vqdmlal11.i.i = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
  ret <2 x i64> %vqdmlal11.i.i
}

define <2 x i64> @test_vqdmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vqdmlal9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %vqdmlal11.i.i = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
  ret <2 x i64> %vqdmlal11.i.i
}

;===-------------------------------------------------------------------------===
; Widening multiply-subtract: smlsl2 / umlsl2
; The IR is a widening multiply whose result is subtracted from %a with a
; plain vector sub.
;===-------------------------------------------------------------------------===

define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
  ret <4 x i32> %sub.i.i
}

define <4 x i32> @test_vmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
  ret <4 x i32> %sub.i.i
}

define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
  ret <2 x i64> %sub.i.i
}

define <2 x i64> @test_vmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
  ret <2 x i64> %sub.i.i
}

define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
  ret <4 x i32> %sub.i.i
}

define <4 x i32> @test_vmlsl_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
  ret <4 x i32> %sub.i.i
}

define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
  ret <2 x i64> %sub.i.i
}

define <2 x i64> @test_vmlsl_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
  ret <2 x i64> %sub.i.i
}

;===-------------------------------------------------------------------------===
; Saturating doubling multiply-subtract: sqdmlsl2
; The IR is an sqdmull intrinsic call whose result is passed, together with
; %a, to the sqsub intrinsic.
;===-------------------------------------------------------------------------===

define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vqdmlsl15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %vqdmlsl17.i.i = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
  ret <4 x i32> %vqdmlsl17.i.i
}

define <4 x i32> @test_vqdmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vqdmlsl15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %vqdmlsl17.i.i = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
  ret <4 x i32> %vqdmlsl17.i.i
}

define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vqdmlsl9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %vqdmlsl11.i.i = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
  ret <2 x i64> %vqdmlsl11.i.i
}

define <2 x i64> @test_vqdmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vqdmlsl9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %vqdmlsl11.i.i = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
  ret <2 x i64> %vqdmlsl11.i.i
}

;===-------------------------------------------------------------------------===
; Floating-point multiply by scalar
; A splat of the scalar argument multiplied with a vector is expected to be
; selected as a single by-element fmul reading lane 0, with no separate dup.
;===-------------------------------------------------------------------------===

define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
; CHECK-LABEL: test_vmul_n_f32:
; CHECK-NEXT: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <2 x float> undef, float %b, i32 0
  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1
  %mul.i = fmul <2 x float> %vecinit1.i, %a
  ret <2 x float> %mul.i
}

define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
; CHECK-LABEL: test_vmulq_n_f32:
; CHECK-NEXT: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <4 x float> undef, float %b, i32 0
  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1
  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3
  %mul.i = fmul <4 x float> %vecinit3.i, %a
  ret <4 x float> %mul.i
}

; Floating-point multiply / fused multiply-add / fused multiply-subtract by a
; scalar argument. In each test the scalar is splatted with insertelement and
; the expected assembly is a single by-element instruction followed by ret,
; i.e. no separate dup of the scalar.

define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
; CHECK-LABEL: test_vmulq_n_f64:
; CHECK-NEXT: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <2 x double> undef, double %b, i32 0
  %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1
  %mul.i = fmul <2 x double> %vecinit1.i, %a
  ret <2 x double> %mul.i
}

; For the four fma/fms tests below the by-element instruction must be the very
; first instruction of the function, so the multiplier can only be read
; straight from the incoming scalar register, i.e. lane 0. The lane index is
; therefore pinned to 0, like the fmul-by-scalar test above, rather than
; accepting any lane number.

define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfma_n_f32:
; CHECK-NEXT: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
  %0 = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a)
  ret <2 x float> %0
}

define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfmaq_n_f32:
; CHECK-NEXT: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
  %0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a)
  ret <4 x float> %0
}

; The multiply-subtract variants negate %b by subtracting it from a vector of
; -0.0 before the fma call; the negation is expected to fold into fmls.

define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfms_n_f32:
; CHECK-NEXT: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
  %1 = call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a)
  ret <2 x float> %1
}

define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfmsq_n_f32:
; CHECK-NEXT: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a)
  ret <4 x float> %1
}

; Attribute group shared by every test function in this file.
attributes #0 = { nounwind }

; Intrinsics called by the tests in this file.
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)