; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s

; Code generation tests for the AArch64 NEON widening multiply family.
; Each function builds a multiply (optionally followed by an add or sub) out
; of sign- or zero-extended narrow vectors and expects a single long
; multiply instruction (smull/umull, smlal/umlal, smlsl/umlsl) in the output.

; --- Signed widening multiply: mul(sext, sext) ---

define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; --- Unsigned widening multiply: mul(zext, zext) ---

define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; --- Signed widening multiply-accumulate: add(acc, mul(sext, sext)) ---

define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

; --- Unsigned widening multiply-accumulate: add(acc, mul(zext, zext)) ---

define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

; --- Signed widening multiply-subtract: sub(acc, mul(sext, sext)) ---

define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

; --- Unsigned widening multiply-subtract: sub(acc, mul(zext, zext)) ---

define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

; SMULL/UMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
; One multiplicand is an extended argument, the other a constant splat.

define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK: movz
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}

define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}

define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK: movz
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}

define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}

define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
; If one operand has a zero-extend and the other a sign-extend, smull
; cannot be used.
; CHECK-LABEL: smullWithInconsistentExtensions:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %1 = sext <8 x i8> %vec to <8 x i16>
  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = extractelement <8 x i16> %2, i32 0
  ret i16 %3
}

; The IR multiplies a sum of two zero-extended halves by a zero-extended
; splat. The expected output is a umull followed by a umlal that accumulates
; into the same destination register and reuses the same splat operand.
define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
entry:
; CHECK-LABEL: distribute:
; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = zext <8 x i8> %6 to <8 x i16>
  %8 = zext <8 x i8> %2 to <8 x i16>
  %9 = extractelement <2 x double> %4, i32 0
  %10 = bitcast double %9 to <8 x i8>
  %11 = zext <8 x i8> %10 to <8 x i16>
  %12 = add <8 x i16> %7, %11
  %13 = mul <8 x i16> %12, %8
  %14 = bitcast i16* %dst to i8*
  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
  ret void
}

declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly

declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind