// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

// Test new aarch64 intrinsics and types
//
// Per the run line above, this file is compiled to LLVM IR, piped through
// opt -mem2reg, and the resulting IR is matched by FileCheck against the
// directives written in front of each test function.

#include <arm_neon.h>

// Multiply-accumulate by lane (vmla[q]_lane[q], s16/s32): the expected IR
// splats the selected lane of v with a shufflevector, multiplies the splat
// with b, and adds the product to a.

// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmla_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlaq_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmla_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlaq_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmla_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlaq_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmla_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlaq_laneq_s32(a, b, v, 3);
}

// Multiply-subtract by lane (vmls[q]_lane[q], s16/s32): same lane splat and
// multiply as the accumulate tests, but the product is subtracted from a.

// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmls_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlsq_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmls_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlsq_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmls_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlsq_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmls_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlsq_laneq_s32(a, b, v, 3);
}

// Multiply by lane (vmul[q]_lane[q]): a is multiplied by a splat of the
// selected lane of v. The signed and unsigned variants below expect the same
// shufflevector + mul sequence.

// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK: ret <4 x i16> [[MUL]]
int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
  return vmul_lane_s16(a, v, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK: ret <8 x i16> [[MUL]]
int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
  return vmulq_lane_s16(a, v, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK: ret <2 x i32> [[MUL]]
int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
  return vmul_lane_s32(a, v, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK: ret <4 x i32> [[MUL]]
int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
  return vmulq_lane_s32(a, v, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK: ret <4 x i16> [[MUL]]
uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
  return vmul_lane_u16(a, v, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK: ret <8 x i16> [[MUL]]
uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
  return vmulq_lane_u16(a, v, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK: ret <2 x i32> [[MUL]]
uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
  return vmul_lane_u32(a, v, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK: ret <4 x i32> [[MUL]]
uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
  return vmulq_lane_u32(a, v, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK: ret <4 x i16> [[MUL]]
int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
  return vmul_laneq_s16(a, v, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK: ret <8 x i16> [[MUL]]
int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vmulq_laneq_s16(a, v, 7);
}

// Remaining multiply-by-lane tests taking the lane from a 128-bit v
// (vmul[q]_laneq): a is multiplied by a shufflevector splat of the selected
// lane; signed and unsigned variants expect the same IR.

// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK: ret <2 x i32> [[MUL]]
int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
  return vmul_laneq_s32(a, v, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK: ret <4 x i32> [[MUL]]
int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vmulq_laneq_s32(a, v, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK: ret <4 x i16> [[MUL]]
uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
  return vmul_laneq_u16(a, v, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK: ret <8 x i16> [[MUL]]
uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
  return vmulq_laneq_u16(a, v, 7);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK: ret <2 x i32> [[MUL]]
uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
  return vmul_laneq_u32(a, v, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK: ret <4 x i32> [[MUL]]
uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
  return vmulq_laneq_u32(a, v, 3);
}

// Fused multiply-add by lane, single precision (vfma[q]_lane[q]_f32): the
// operands are bitcast to byte vectors and back, the selected lane of v is
// splatted, and the result comes from a call to llvm.fma. The expected operand
// order differs between the forms: _lane passes (b, splat, a) while _laneq
// passes (splat, b, a).

// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK: ret <2 x float> [[FMLA2]]
float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfma_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK: ret <4 x float> [[FMLA2]]
float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmaq_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK: ret <2 x float> [[TMP6]]
float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfma_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK: ret <4 x float> [[TMP6]]
float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmaq_laneq_f32(a, b, v, 3);
}

// Fused multiply-subtract by lane, single precision (vfms[q]_lane[q]_f32):
// same patterns as the vfma tests, except that b is first negated with an
// fsub from -0.0 and the negated value is what reaches llvm.fma.

// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK: ret <2 x float> [[FMLA2]]
float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfms_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK: ret <4 x float> [[FMLA2]]
float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmsq_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK: ret <2 x float> [[TMP6]]
float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfms_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK: ret <4 x float> [[TMP6]]
float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmsq_laneq_f32(a, b, v, 3);
}

// Double-precision fused multiply-add by lane. When v is a one-element vector
// the splat to two elements is expected to use a zeroinitializer shuffle mask.

// CHECK-LABEL: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
// CHECK: ret <2 x double> [[FMLA2]]
float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
  return vfmaq_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> 416 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1> 417 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) 418 // CHECK: ret <2 x double> [[TMP6]] 419 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { 420 return vfmaq_laneq_f64(a, b, v, 1); 421 } 422 423 // CHECK-LABEL: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 { 424 // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b 425 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> 426 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8> 427 // CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8> 428 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> 429 // CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer 430 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 431 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> 432 // CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]]) 433 // CHECK: ret <2 x double> [[FMLA2]] 434 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { 435 return vfmsq_lane_f64(a, b, v, 0); 436 } 437 438 // CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 { 439 // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b 440 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> 441 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8> 442 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8> 443 // CHECK: [[TMP3:%.*]] = bitcast <16 x 
i8> [[TMP0]] to <2 x double> 444 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 445 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> 446 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1> 447 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) 448 // CHECK: ret <2 x double> [[TMP6]] 449 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { 450 return vfmsq_laneq_f64(a, b, v, 1); 451 } 452 453 // CHECK-LABEL: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) #0 { 454 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8> 455 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 456 // CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 457 // CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a) 458 // CHECK: ret float [[TMP2]] 459 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) { 460 return vfmas_laneq_f32(a, b, v, 3); 461 } 462 463 // CHECK-LABEL: define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) #0 { 464 // CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b 465 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8> 466 // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> 467 // CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 468 // CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a) 469 // CHECK: ret double [[TMP2]] 470 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) { 471 return vfmsd_lane_f64(a, b, v, 0); 472 } 473 474 // CHECK-LABEL: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) #0 { 475 // CHECK: [[SUB:%.*]] = fsub float -0.000000e+00, %b 476 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x 
i8> 477 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 478 // CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 479 // CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a) 480 // CHECK: ret float [[TMP2]] 481 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) { 482 return vfmss_laneq_f32(a, b, v, 3); 483 } 484 485 // CHECK-LABEL: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) #0 { 486 // CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b 487 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8> 488 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> 489 // CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 490 // CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a) 491 // CHECK: ret double [[TMP2]] 492 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) { 493 return vfmsd_laneq_f64(a, b, v, 1); 494 } 495 496 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 497 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 498 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 499 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 500 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 501 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 502 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 503 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 504 // CHECK: ret <4 x i32> [[ADD]] 505 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { 506 return vmlal_lane_s16(a, b, v, 3); 507 } 508 509 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 510 // 
CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 511 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 512 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 513 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 514 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 515 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 516 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 517 // CHECK: ret <2 x i64> [[ADD]] 518 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { 519 return vmlal_lane_s32(a, b, v, 1); 520 } 521 522 // CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 523 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 524 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 525 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 526 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 527 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 528 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 529 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 530 // CHECK: ret <4 x i32> [[ADD]] 531 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { 532 return vmlal_laneq_s16(a, b, v, 7); 533 } 534 535 // CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 536 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 537 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 538 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 539 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 540 // CHECK: 
[[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 541 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 542 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 543 // CHECK: ret <2 x i64> [[ADD]] 544 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { 545 return vmlal_laneq_s32(a, b, v, 3); 546 } 547 548 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 549 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 550 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 551 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 552 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 553 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 554 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 555 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 556 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 557 // CHECK: ret <4 x i32> [[ADD]] 558 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { 559 return vmlal_high_lane_s16(a, b, v, 3); 560 } 561 562 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 563 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 564 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 565 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 566 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 567 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 568 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 569 // 
CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 570 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 571 // CHECK: ret <2 x i64> [[ADD]] 572 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { 573 return vmlal_high_lane_s32(a, b, v, 1); 574 } 575 576 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 577 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 578 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 579 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 580 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 581 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 582 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 583 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 584 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 585 // CHECK: ret <4 x i32> [[ADD]] 586 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { 587 return vmlal_high_laneq_s16(a, b, v, 7); 588 } 589 590 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 591 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 592 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 593 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 594 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 595 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 596 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 597 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 598 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 599 // CHECK: ret <2 x i64> [[ADD]] 600 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { 601 return vmlal_high_laneq_s32(a, b, v, 3); 602 } 603 // vmlsl_lane_s16, lane 3: expects smull of %b with a splat of %v[3], subtracted from %a. 604 // CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 605 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 606 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 607 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 608 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 609 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 610 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 611 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 612 // CHECK: ret <4 x i32> [[SUB]] 613 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { 614 return vmlsl_lane_s16(a, b, v, 3); 615 } 616 // vmlsl_lane_s32, lane 1: expects smull of %b with a splat of %v[1], subtracted from %a. 617 // CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 618 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 619 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 620 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 621 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 622 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 623 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 624 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 625 // CHECK: ret <2 x i64> [[SUB]] 626 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { 627 return vmlsl_lane_s32(a, b, v, 1); 628 } 629 // vmlsl_laneq_s16, lane 7: expects smull of %b with a splat of %v[7], subtracted from %a. 630 // 
CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 631 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 632 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 633 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 634 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 635 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 636 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 637 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 638 // CHECK: ret <4 x i32> [[SUB]] 639 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { 640 return vmlsl_laneq_s16(a, b, v, 7); 641 } 642 // vmlsl_laneq_s32, lane 3: expects smull of %b with a splat of %v[3], subtracted from %a. 643 // CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 644 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 645 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 646 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 647 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 648 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 649 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 650 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 651 // CHECK: ret <2 x i64> [[SUB]] 652 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { 653 return vmlsl_laneq_s32(a, b, v, 3); 654 } 655 // vmlsl_high_lane_s16, lane 3: expects smull of the high half of %b (elements 4-7) with a splat of %v[3], subtracted from %a. 656 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 657 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 658 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, 
i32 3> 659 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 660 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 661 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 662 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 663 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 664 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 665 // CHECK: ret <4 x i32> [[SUB]] 666 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { 667 return vmlsl_high_lane_s16(a, b, v, 3); 668 } 669 // vmlsl_high_lane_s32, lane 1: expects smull of the high half of %b (elements 2-3) with a splat of %v[1], subtracted from %a. 670 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 671 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 672 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 673 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 674 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 675 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 676 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 677 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 678 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 679 // CHECK: ret <2 x i64> [[SUB]] 680 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { 681 return vmlsl_high_lane_s32(a, b, v, 1); 682 } 683 // vmlsl_high_laneq_s16, lane 7: expects smull of the high half of %b (elements 4-7) with a splat of %v[7], subtracted from %a. 684 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 685 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 686 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 687 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> 
[[SHUFFLE_I]] to <8 x i8> 688 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 689 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 690 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 691 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 692 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 693 // CHECK: ret <4 x i32> [[SUB]] 694 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { 695 return vmlsl_high_laneq_s16(a, b, v, 7); 696 } 697 // vmlsl_high_laneq_s32, lane 3: expects smull of the high half of %b (elements 2-3) with a splat of %v[3], subtracted from %a. 698 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 699 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 700 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 701 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 702 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 703 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 704 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 705 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 706 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 707 // CHECK: ret <2 x i64> [[SUB]] 708 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { 709 return vmlsl_high_laneq_s32(a, b, v, 3); 710 } 711 // vmlal_lane_u16, lane 3: expects umull of %b with a splat of %v[3], added to %a. 712 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 713 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 714 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 715 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 716 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 717 // CHECK: 
[[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 718 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 719 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 720 // CHECK: ret <4 x i32> [[ADD]] 721 uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v) { 722 return vmlal_lane_u16(a, b, v, 3); 723 } 724 // vmlal_lane_u32, lane 1: expects umull of %b with a splat of %v[1], added to %a. The unsigned tests in this group spell their operands with uintNxM_t types so they do not rely on lax vector conversions; the emitted IR types are the same. 725 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 726 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 727 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 728 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 729 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 730 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 731 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 732 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 733 // CHECK: ret <2 x i64> [[ADD]] 734 uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v) { 735 return vmlal_lane_u32(a, b, v, 1); 736 } 737 // vmlal_laneq_u16, lane 7: expects umull of %b with a splat of %v[7], added to %a. 738 // CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 739 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 740 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 741 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 742 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 743 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 744 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 745 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 746 // CHECK: ret <4 x i32> [[ADD]] 747 uint32x4_t test_vmlal_laneq_u16(uint32x4_t 
a, uint16x4_t b, uint16x8_t v) { 748 return vmlal_laneq_u16(a, b, v, 7); 749 } 750 // vmlal_laneq_u32, lane 3: expects umull of %b with a splat of %v[3], added to %a. 751 // CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 752 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 753 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 754 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 755 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 756 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 757 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 758 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 759 // CHECK: ret <2 x i64> [[ADD]] 760 uint64x2_t test_vmlal_laneq_u32(uint64x2_t a, uint32x2_t b, uint32x4_t v) { 761 return vmlal_laneq_u32(a, b, v, 3); 762 } 763 // vmlal_high_lane_u16, lane 3: expects umull of the high half of %b (elements 4-7) with a splat of %v[3], added to %a. 764 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 765 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 766 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 767 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 768 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 769 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 770 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 771 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 772 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 773 // CHECK: ret <4 x i32> [[ADD]] 774 uint32x4_t test_vmlal_high_lane_u16(uint32x4_t a, uint16x8_t b, uint16x4_t v) { 775 return vmlal_high_lane_u16(a, b, v, 3); 776 } 777 // vmlal_high_lane_u32, lane 1: expects umull of the high half of %b (elements 2-3) with a splat of %v[1], added to %a. 778 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) 
#0 { 779 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 780 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 781 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 782 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 783 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 784 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 785 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 786 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 787 // CHECK: ret <2 x i64> [[ADD]] 788 uint64x2_t test_vmlal_high_lane_u32(uint64x2_t a, uint32x4_t b, uint32x2_t v) { 789 return vmlal_high_lane_u32(a, b, v, 1); 790 } 791 // vmlal_high_laneq_u16, lane 7: expects umull of the high half of %b (elements 4-7) with a splat of %v[7], added to %a. Operands use uintNxM_t types so the test does not rely on lax vector conversions; the emitted IR types are the same. 792 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 793 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 794 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 795 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 796 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 797 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 798 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 799 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 800 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 801 // CHECK: ret <4 x i32> [[ADD]] 802 uint32x4_t test_vmlal_high_laneq_u16(uint32x4_t a, uint16x8_t b, uint16x8_t v) { 803 return vmlal_high_laneq_u16(a, b, v, 7); 804 } 805 // vmlal_high_laneq_u32, lane 3: expects umull of the high half of %b (elements 2-3) with a splat of %v[3], added to %a. 806 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 807 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x 
i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 808 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 809 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 810 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 811 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 812 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 813 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 814 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 815 // CHECK: ret <2 x i64> [[ADD]] 816 uint64x2_t test_vmlal_high_laneq_u32(uint64x2_t a, uint32x4_t b, uint32x4_t v) { 817 return vmlal_high_laneq_u32(a, b, v, 3); 818 } 819 // vmlsl_lane_u16, lane 3: expects umull of %b with a splat of %v[3], subtracted from %a. Operands use uintNxM_t types so the test does not rely on lax vector conversions; the emitted IR types are the same. 820 // CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 821 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 822 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 823 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 824 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 825 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 826 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 827 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 828 // CHECK: ret <4 x i32> [[SUB]] 829 uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v) { 830 return vmlsl_lane_u16(a, b, v, 3); 831 } 832 // vmlsl_lane_u32, lane 1: expects umull of %b with a splat of %v[1], subtracted from %a. 833 // CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 834 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 835 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 836 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 837 // CHECK: 
[[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 838 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 839 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 840 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 841 // CHECK: ret <2 x i64> [[SUB]] 842 uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v) { 843 return vmlsl_lane_u32(a, b, v, 1); 844 } 845 // vmlsl_laneq_u16, lane 7: expects umull of %b with a splat of %v[7], subtracted from %a. Operands use uintNxM_t types so the test does not rely on lax vector conversions; the emitted IR types are the same. 846 // CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 847 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 848 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 849 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 850 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 851 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 852 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 853 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 854 // CHECK: ret <4 x i32> [[SUB]] 855 uint32x4_t test_vmlsl_laneq_u16(uint32x4_t a, uint16x4_t b, uint16x8_t v) { 856 return vmlsl_laneq_u16(a, b, v, 7); 857 } 858 // vmlsl_laneq_u32, lane 3: expects umull of %b with a splat of %v[3], subtracted from %a. 859 // CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 860 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 861 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 862 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 863 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 864 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 865 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 866 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 867 // 
CHECK: ret <2 x i64> [[SUB]] 868 uint64x2_t test_vmlsl_laneq_u32(uint64x2_t a, uint32x2_t b, uint32x4_t v) { 869 return vmlsl_laneq_u32(a, b, v, 3); 870 } 871 // vmlsl_high_lane_u16, lane 3: expects umull of the high half of %b (elements 4-7) with a splat of %v[3], subtracted from %a. Operands use uintNxM_t types so the test does not rely on lax vector conversions; the emitted IR types are the same. 872 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 873 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 874 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 875 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 876 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 877 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 878 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 879 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 880 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 881 // CHECK: ret <4 x i32> [[SUB]] 882 uint32x4_t test_vmlsl_high_lane_u16(uint32x4_t a, uint16x8_t b, uint16x4_t v) { 883 return vmlsl_high_lane_u16(a, b, v, 3); 884 } 885 // vmlsl_high_lane_u32, lane 1: expects umull of the high half of %b (elements 2-3) with a splat of %v[1], subtracted from %a. 886 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 887 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 888 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 889 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 890 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 891 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 892 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 893 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 894 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 895 // CHECK: ret <2 x i64> [[SUB]] 896 uint64x2_t 
test_vmlsl_high_lane_u32(uint64x2_t a, uint32x4_t b, uint32x2_t v) { 897 return vmlsl_high_lane_u32(a, b, v, 1); 898 } 899 // vmlsl_high_laneq_u16, lane 7: expects umull of the high half of %b (elements 4-7) with a splat of %v[7], subtracted from %a. 900 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 901 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 902 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 903 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 904 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 905 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 906 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 907 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 908 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 909 // CHECK: ret <4 x i32> [[SUB]] 910 uint32x4_t test_vmlsl_high_laneq_u16(uint32x4_t a, uint16x8_t b, uint16x8_t v) { 911 return vmlsl_high_laneq_u16(a, b, v, 7); 912 } 913 // vmlsl_high_laneq_u32, lane 3: expects umull of the high half of %b (elements 2-3) with a splat of %v[3], subtracted from %a. 914 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 915 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 916 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 917 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 918 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 919 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 920 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 921 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 922 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 923 // CHECK: ret <2 x i64> [[SUB]] 924 uint64x2_t test_vmlsl_high_laneq_u32(uint64x2_t a, uint32x4_t b, 
uint32x4_t v) { 925 return vmlsl_high_laneq_u32(a, b, v, 3); 926 } 927 // vmull_lane_s16, lane 3: expects smull of %a with a splat of %v[3]; the product is returned directly. 928 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { 929 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 930 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 931 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 932 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 933 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 934 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 935 // CHECK: ret <4 x i32> [[VMULL2_I]] 936 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { 937 return vmull_lane_s16(a, v, 3); 938 } 939 // vmull_lane_s32, lane 1: expects smull of %a with a splat of %v[1]; the product is returned directly. 940 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { 941 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 942 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 943 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 944 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 945 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 946 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 947 // CHECK: ret <2 x i64> [[VMULL2_I]] 948 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { 949 return vmull_lane_s32(a, v, 1); 950 } 951 // vmull_lane_u16, lane 3: expects umull of %a with a splat of %v[3]; the product is returned directly. 952 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 { 953 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 954 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 955 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 956 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 957 // CHECK: 
[[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 958 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 959 // CHECK: ret <4 x i32> [[VMULL2_I]] 960 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { 961 return vmull_lane_u16(a, v, 3); 962 } 963 // vmull_lane_u32, lane 1: expects umull of %a with a splat of %v[1]; the product is returned directly. 964 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 { 965 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 966 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 967 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 968 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 969 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 970 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 971 // CHECK: ret <2 x i64> [[VMULL2_I]] 972 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { 973 return vmull_lane_u32(a, v, 1); 974 } 975 // vmull_high_lane_s16, lane 3: expects smull of the high half of %a (elements 4-7) with a splat of %v[3]; the product is returned directly. 976 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { 977 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 978 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 979 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 980 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 981 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 982 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 983 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 984 // CHECK: ret <4 x i32> [[VMULL2_I]] 985 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { 986 return vmull_high_lane_s16(a, v, 3); 987 } 988 // vmull_high_lane_s32, lane 1: expects smull of the high half of %a (elements 2-3) with a splat of %v[1]; the product is returned directly. 989 // 
CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { 990 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 991 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 992 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 993 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 994 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 995 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 996 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 997 // CHECK: ret <2 x i64> [[VMULL2_I]] 998 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { 999 return vmull_high_lane_s32(a, v, 1); 1000 } 1001 // vmull_high_lane_u16, lane 3: expects umull of the high half of %a (elements 4-7) with a splat of %v[3]; the product is returned directly. 1002 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 { 1003 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1004 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1005 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 1006 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1007 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1008 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1009 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 1010 // CHECK: ret <4 x i32> [[VMULL2_I]] 1011 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { 1012 return vmull_high_lane_u16(a, v, 3); 1013 } 1014 // vmull_high_lane_u32, lane 1: expects umull of the high half of %a (elements 2-3) with a splat of %v[1]; the product is returned directly. 1015 // CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 { 1016 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 1017 // CHECK: 
[[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 1018 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 1019 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1020 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1021 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1022 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 1023 // CHECK: ret <2 x i64> [[VMULL2_I]] 1024 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { 1025 return vmull_high_lane_u32(a, v, 1); 1026 } 1027 // vmull_laneq_s16, lane 7: expects smull of %a with a splat of %v[7]; the product is returned directly. 1028 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { 1029 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 1030 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 1031 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1032 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1033 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1034 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 1035 // CHECK: ret <4 x i32> [[VMULL2_I]] 1036 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { 1037 return vmull_laneq_s16(a, v, 7); 1038 } 1039 // vmull_laneq_s32, lane 3: expects smull of %a with a splat of %v[3]; the product is returned directly. 1040 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { 1041 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 1042 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 1043 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1044 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1045 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1046 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 1047 // CHECK: ret <2 x i64> [[VMULL2_I]] 1048 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { 1049 return vmull_laneq_s32(a, v, 3); 1050 } 1051 1052 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 { 1053 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 1054 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 1055 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1056 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1057 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1058 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 1059 // CHECK: ret <4 x i32> [[VMULL2_I]] 1060 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { 1061 return vmull_laneq_u16(a, v, 7); 1062 } 1063 1064 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 { 1065 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 1066 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 1067 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1068 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1069 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1070 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 1071 // CHECK: ret <2 x i64> [[VMULL2_I]] 1072 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { 1073 return vmull_laneq_u32(a, v, 3); 1074 } 1075 1076 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { 1077 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1078 // 
CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 1079 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 1080 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1081 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1082 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1083 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 1084 // CHECK: ret <4 x i32> [[VMULL2_I]] 1085 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { 1086 return vmull_high_laneq_s16(a, v, 7); 1087 } 1088 1089 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { 1090 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 1091 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 1092 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 1093 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1094 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1095 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1096 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 1097 // CHECK: ret <2 x i64> [[VMULL2_I]] 1098 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { 1099 return vmull_high_laneq_s32(a, v, 3); 1100 } 1101 1102 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 { 1103 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1104 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 1105 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 1106 // CHECK: 
[[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1107 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1108 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1109 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 1110 // CHECK: ret <4 x i32> [[VMULL2_I]] 1111 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { 1112 return vmull_high_laneq_u16(a, v, 7); 1113 } 1114 1115 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 { 1116 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 1117 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 1118 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 1119 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1120 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1121 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1122 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 1123 // CHECK: ret <2 x i64> [[VMULL2_I]] 1124 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { 1125 return vmull_high_laneq_u32(a, v, 3); 1126 } 1127 1128 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 1129 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1130 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 1131 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> 1132 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1133 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1134 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 1135 // CHECK: [[VQDMLAL2_I:%.*]] = 
call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 1136 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 1137 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 1138 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] 1139 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { 1140 return vqdmlal_lane_s16(a, b, v, 3); 1141 } 1142 1143 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 1144 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 1145 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 1146 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> 1147 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1148 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1149 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 1150 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 1151 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 1152 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 1153 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] 1154 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { 1155 return vqdmlal_lane_s32(a, b, v, 1); 1156 } 1157 1158 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 1159 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1160 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1161 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 1162 
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 1163 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1164 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1165 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 1166 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 1167 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 1168 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 1169 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] 1170 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { 1171 return vqdmlal_high_lane_s16(a, b, v, 3); 1172 } 1173 1174 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 1175 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 1176 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 1177 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 1178 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 1179 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1180 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1181 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 1182 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 1183 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 1184 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 1185 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] 1186 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { 1187 
return vqdmlal_high_lane_s32(a, b, v, 1); 1188 } 1189 1190 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 1191 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1192 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 1193 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> 1194 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1195 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1196 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 1197 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 1198 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 1199 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 1200 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] 1201 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { 1202 return vqdmlsl_lane_s16(a, b, v, 3); 1203 } 1204 1205 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 1206 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 1207 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 1208 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> 1209 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1210 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1211 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 1212 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 1213 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 1214 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 1215 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] 1216 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { 1217 return vqdmlsl_lane_s32(a, b, v, 1); 1218 } 1219 1220 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 1221 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1222 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1223 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 1224 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 1225 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1226 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1227 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 1228 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 1229 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 1230 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 1231 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] 1232 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { 1233 return vqdmlsl_high_lane_s16(a, b, v, 3); 1234 } 1235 1236 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 1237 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 1238 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 1239 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 1240 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 1241 // CHECK: [[TMP2:%.*]] = 
bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1242 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1243 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 1244 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 1245 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 1246 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 1247 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] 1248 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { 1249 return vqdmlsl_high_lane_s32(a, b, v, 1); 1250 } 1251 1252 // CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { 1253 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1254 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 1255 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1256 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1257 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1258 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 1259 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> 1260 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> 1261 // CHECK: ret <4 x i32> [[TMP2]] 1262 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) { 1263 return vqdmull_lane_s16(a, v, 3); 1264 } 1265 1266 // CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { 1267 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 1268 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 1269 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> 
[[SHUFFLE]] to <8 x i8> 1270 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1271 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1272 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 1273 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> 1274 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> 1275 // CHECK: ret <2 x i64> [[TMP2]] 1276 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) { 1277 return vqdmull_lane_s32(a, v, 1); 1278 } 1279 1280 // CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 { 1281 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1282 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 1283 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1284 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1285 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1286 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 1287 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> 1288 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> 1289 // CHECK: ret <4 x i32> [[TMP2]] 1290 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) { 1291 return vqdmull_laneq_s16(a, v, 3); 1292 } 1293 1294 // CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 { 1295 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 1296 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 1297 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1298 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 
x i32> 1299 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1300 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 1301 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> 1302 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> 1303 // CHECK: ret <2 x i64> [[TMP2]] 1304 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) { 1305 return vqdmull_laneq_s32(a, v, 3); 1306 } 1307 1308 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { 1309 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1310 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1311 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 1312 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1313 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1314 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1315 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 1316 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> 1317 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> 1318 // CHECK: ret <4 x i32> [[TMP2]] 1319 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) { 1320 return vqdmull_high_lane_s16(a, v, 3); 1321 } 1322 1323 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { 1324 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 1325 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 1326 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> 
[[SHUFFLE_I]] to <8 x i8> 1327 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1328 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1329 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1330 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 1331 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> 1332 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> 1333 // CHECK: ret <2 x i64> [[TMP2]] 1334 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) { 1335 return vqdmull_high_lane_s32(a, v, 1); 1336 } 1337 1338 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 { 1339 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1340 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 1341 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 1342 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1343 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1344 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1345 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 1346 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> 1347 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> 1348 // CHECK: ret <4 x i32> [[TMP2]] 1349 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) { 1350 return vqdmull_high_laneq_s16(a, v, 7); 1351 } 1352 1353 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 { 1354 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> 
%a, <2 x i32> <i32 2, i32 3> 1355 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3> 1356 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 1357 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1358 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1359 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1360 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 1361 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> 1362 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> 1363 // CHECK: ret <2 x i64> [[TMP2]] 1364 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) { 1365 return vqdmull_high_laneq_s32(a, v, 3); 1366 } 1367 1368 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { 1369 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1370 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 1371 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1372 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1373 // CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1374 // CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 1375 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> 1376 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> 1377 // CHECK: ret <4 x i16> [[TMP2]] 1378 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) { 1379 return vqdmulh_lane_s16(a, v, 3); 1380 } 1381 1382 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { 1383 // CHECK: [[SHUFFLE:%.*]] = shufflevector 
<4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> 1384 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> 1385 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> 1386 // CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 1387 // CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> 1388 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 1389 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> 1390 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> 1391 // CHECK: ret <8 x i16> [[TMP2]] 1392 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) { 1393 return vqdmulhq_lane_s16(a, v, 3); 1394 } 1395 1396 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { 1397 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 1398 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 1399 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1400 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1401 // CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1402 // CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 1403 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> 1404 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> 1405 // CHECK: ret <2 x i32> [[TMP2]] 1406 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) { 1407 return vqdmulh_lane_s32(a, v, 1); 1408 } 1409 1410 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { 1411 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 
1, i32 1> 1412 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 1413 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> 1414 // CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 1415 // CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 1416 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 1417 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> 1418 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> 1419 // CHECK: ret <4 x i32> [[TMP2]] 1420 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) { 1421 return vqdmulhq_lane_s32(a, v, 1); 1422 } 1423 1424 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 { 1425 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1426 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 1427 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 1428 // CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1429 // CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 1430 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 1431 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> 1432 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> 1433 // CHECK: ret <4 x i16> [[TMP2]] 1434 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) { 1435 return vqrdmulh_lane_s16(a, v, 3); 1436 } 1437 1438 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 { 1439 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> 1440 // CHECK: 
[[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> 1441 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> 1442 // CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 1443 // CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> 1444 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 1445 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> 1446 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> 1447 // CHECK: ret <8 x i16> [[TMP2]] 1448 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) { 1449 return vqrdmulhq_lane_s16(a, v, 3); 1450 } 1451 1452 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 { 1453 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1> 1454 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 1455 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 1456 // CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1457 // CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 1458 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 1459 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> 1460 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> 1461 // CHECK: ret <2 x i32> [[TMP2]] 1462 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) { 1463 return vqrdmulh_lane_s32(a, v, 1); 1464 } 1465 1466 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 { 1467 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1468 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 1469 // 
CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> 1470 // CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 1471 // CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 1472 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 1473 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> 1474 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> 1475 // CHECK: ret <4 x i32> [[TMP2]] 1476 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) { 1477 return vqrdmulhq_lane_s32(a, v, 1); 1478 } 1479 1480 // CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) #0 { 1481 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> 1482 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] 1483 // CHECK: ret <2 x float> [[MUL]] 1484 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { 1485 return vmul_lane_f32(a, v, 1); 1486 } 1487 1488 1489 // CHECK-LABEL: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) #0 { 1490 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> 1491 // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8> 1492 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double 1493 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> 1494 // CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 1495 // CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] 1496 // CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> 1497 // CHECK: ret <1 x double> [[TMP5]] 1498 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) { 1499 return vmul_lane_f64(a, v, 0); 1500 } 1501 1502 1503 // CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) #0 { 1504 // CHECK: [[SHUFFLE:%.*]] = 
shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1505 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] 1506 // CHECK: ret <4 x float> [[MUL]] 1507 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { 1508 return vmulq_lane_f32(a, v, 1); 1509 } 1510 1511 // CHECK-LABEL: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) #0 { 1512 // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer 1513 // CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] 1514 // CHECK: ret <2 x double> [[MUL]] 1515 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { 1516 return vmulq_lane_f64(a, v, 0); 1517 } 1518 1519 // CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) #0 { 1520 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> 1521 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] 1522 // CHECK: ret <2 x float> [[MUL]] 1523 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { 1524 return vmul_laneq_f32(a, v, 3); 1525 } 1526 1527 // CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) #0 { 1528 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> 1529 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8> 1530 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double 1531 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 1532 // CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 1533 // CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] 1534 // CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> 1535 // CHECK: ret <1 x double> [[TMP5]] 1536 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) { 1537 return vmul_laneq_f64(a, v, 1); 1538 } 1539 1540 1541 // CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 
{ 1542 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1543 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] 1544 // CHECK: ret <4 x float> [[MUL]] 1545 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { 1546 return vmulq_laneq_f32(a, v, 3); 1547 } 1548 1549 // CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 { 1550 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1> 1551 // CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] 1552 // CHECK: ret <2 x double> [[MUL]] 1553 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { 1554 return vmulq_laneq_f64(a, v, 1); 1555 } 1556 1557 // CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) #0 { 1558 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1> 1559 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 1560 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> 1561 // CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 1562 // CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 1563 // CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 1564 // CHECK: ret <2 x float> [[VMULX2_I]] 1565 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { 1566 return vmulx_lane_f32(a, v, 1); 1567 } 1568 1569 // CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) #0 { 1570 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1571 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> 1572 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> 1573 // CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 1574 
// CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 1575 // CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 1576 // CHECK: ret <4 x float> [[VMULX2_I]] 1577 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { 1578 return vmulxq_lane_f32(a, v, 1); 1579 } 1580 1581 // CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) #0 { 1582 // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer 1583 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> 1584 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> 1585 // CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> 1586 // CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 1587 // CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 1588 // CHECK: ret <2 x double> [[VMULX2_I]] 1589 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { 1590 return vmulxq_lane_f64(a, v, 0); 1591 } 1592 1593 // CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) #0 { 1594 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3> 1595 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 1596 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> 1597 // CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 1598 // CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 1599 // CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 1600 // CHECK: ret <2 x float> [[VMULX2_I]] 1601 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { 1602 return vmulx_laneq_f32(a, v, 3); 1603 } 1604 1605 // CHECK-LABEL: define 
<4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 { 1606 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1607 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> 1608 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> 1609 // CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 1610 // CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 1611 // CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 1612 // CHECK: ret <4 x float> [[VMULX2_I]] 1613 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { 1614 return vmulxq_laneq_f32(a, v, 3); 1615 } 1616 1617 // CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 { 1618 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1> 1619 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> 1620 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> 1621 // CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> 1622 // CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 1623 // CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 1624 // CHECK: ret <2 x double> [[VMULX2_I]] 1625 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { 1626 return vmulxq_laneq_f64(a, v, 1); 1627 } 1628 1629 // CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { 1630 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 1631 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] 1632 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] 1633 // CHECK: ret <4 x i16> [[ADD]] 1634 int16x4_t test_vmla_lane_s16_0(int16x4_t 
a, int16x4_t b, int16x4_t v) { 1635 return vmla_lane_s16(a, b, v, 0); 1636 } 1637 1638 // CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { 1639 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer 1640 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] 1641 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] 1642 // CHECK: ret <8 x i16> [[ADD]] 1643 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) { 1644 return vmlaq_lane_s16(a, b, v, 0); 1645 } 1646 1647 // CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { 1648 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 1649 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] 1650 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] 1651 // CHECK: ret <2 x i32> [[ADD]] 1652 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) { 1653 return vmla_lane_s32(a, b, v, 0); 1654 } 1655 1656 // CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { 1657 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer 1658 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] 1659 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] 1660 // CHECK: ret <4 x i32> [[ADD]] 1661 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) { 1662 return vmlaq_lane_s32(a, b, v, 0); 1663 } 1664 1665 // CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { 1666 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 1667 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] 1668 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] 1669 // CHECK: ret <4 x i16> [[ADD]] 1670 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) { 1671 return 
vmla_laneq_s16(a, b, v, 0);
}

// vmlaq_laneq_s16 with lane 0: the expected IR broadcasts element 0 of the
// 8-lane %v through a zero-mask shufflevector, multiplies it by %b and adds
// the product to %a.
// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[ADD]]
int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlaq_laneq_s16(a, b, v, 0);
}

// vmla_laneq_s32 with lane 0: element 0 of the 4-lane %v is broadcast to a
// 2-lane vector, multiplied by %b and added to %a.
// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[ADD]]
int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmla_laneq_s32(a, b, v, 0);
}

// vmlaq_laneq_s32 with lane 0: same shape as above on 4-lane operands
// (zero-mask shufflevector of %v, mul by %b, add to %a).
// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[ADD]]
int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlaq_laneq_s32(a, b, v, 0);
}

// vmls_lane_s16 with lane 0: multiply-subtract variant; the product of %b and
// the broadcast of element 0 of %v is subtracted from %a.
// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[SUB]]
int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmls_lane_s16(a, b, v, 0);
}

// vmlsq_lane_s16 with lane 0: element 0 of the 4-lane %v is broadcast to
// 8 lanes by a zero-mask shufflevector, multiplied by %b, and the product is
// subtracted from %a.
// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK:   ret <8 x i16> [[SUB]]
int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlsq_lane_s16(a, b, v, 0);
}

// vmls_lane_s32 with lane 0: 2-lane multiply-subtract by the broadcast of
// element 0 of %v.
// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK:   ret <2 x i32> [[SUB]]
int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmls_lane_s32(a, b, v, 0);
}

// vmlsq_lane_s32 with lane 0: the 2-lane %v is widened to a 4-lane broadcast
// of element 0 before the mul / sub pair.
// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK:   ret <4 x i32> [[SUB]]
int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlsq_lane_s32(a, b, v, 0);
}

// vmls_laneq_s16 with lane 0: the 8-lane %v is narrowed to a 4-lane broadcast
// of element 0 before the mul / sub pair.
// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK:   ret <4 x i16> [[SUB]]
int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmls_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: define <8 x i16> 
@test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { 1747 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer 1748 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] 1749 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] 1750 // CHECK: ret <8 x i16> [[SUB]] 1751 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) { 1752 return vmlsq_laneq_s16(a, b, v, 0); 1753 } 1754 1755 // CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { 1756 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 1757 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] 1758 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] 1759 // CHECK: ret <2 x i32> [[SUB]] 1760 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) { 1761 return vmls_laneq_s32(a, b, v, 0); 1762 } 1763 1764 // CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { 1765 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer 1766 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] 1767 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] 1768 // CHECK: ret <4 x i32> [[SUB]] 1769 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) { 1770 return vmlsq_laneq_s32(a, b, v, 0); 1771 } 1772 1773 // CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { 1774 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 1775 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] 1776 // CHECK: ret <4 x i16> [[MUL]] 1777 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) { 1778 return vmul_lane_s16(a, v, 0); 1779 } 1780 1781 // CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { 1782 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x 
i16> %v, <8 x i32> zeroinitializer 1783 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] 1784 // CHECK: ret <8 x i16> [[MUL]] 1785 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) { 1786 return vmulq_lane_s16(a, v, 0); 1787 } 1788 1789 // CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { 1790 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 1791 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] 1792 // CHECK: ret <2 x i32> [[MUL]] 1793 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) { 1794 return vmul_lane_s32(a, v, 0); 1795 } 1796 1797 // CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { 1798 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer 1799 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] 1800 // CHECK: ret <4 x i32> [[MUL]] 1801 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) { 1802 return vmulq_lane_s32(a, v, 0); 1803 } 1804 1805 // CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 { 1806 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 1807 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] 1808 // CHECK: ret <4 x i16> [[MUL]] 1809 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) { 1810 return vmul_lane_u16(a, v, 0); 1811 } 1812 1813 // CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 { 1814 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer 1815 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] 1816 // CHECK: ret <8 x i16> [[MUL]] 1817 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) { 1818 return vmulq_lane_u16(a, v, 0); 1819 } 1820 1821 // CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 { 1822 // CHECK: [[SHUFFLE:%.*]] = 
shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 1823 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] 1824 // CHECK: ret <2 x i32> [[MUL]] 1825 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) { 1826 return vmul_lane_u32(a, v, 0); 1827 } 1828 1829 // CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 { 1830 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer 1831 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] 1832 // CHECK: ret <4 x i32> [[MUL]] 1833 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) { 1834 return vmulq_lane_u32(a, v, 0); 1835 } 1836 1837 // CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { 1838 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 1839 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] 1840 // CHECK: ret <4 x i16> [[MUL]] 1841 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) { 1842 return vmul_laneq_s16(a, v, 0); 1843 } 1844 1845 // CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { 1846 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer 1847 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] 1848 // CHECK: ret <8 x i16> [[MUL]] 1849 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) { 1850 return vmulq_laneq_s16(a, v, 0); 1851 } 1852 1853 // CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { 1854 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 1855 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] 1856 // CHECK: ret <2 x i32> [[MUL]] 1857 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) { 1858 return vmul_laneq_s32(a, v, 0); 1859 } 1860 1861 // CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { 
1862 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer 1863 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] 1864 // CHECK: ret <4 x i32> [[MUL]] 1865 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) { 1866 return vmulq_laneq_s32(a, v, 0); 1867 } 1868 1869 // CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 { 1870 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 1871 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] 1872 // CHECK: ret <4 x i16> [[MUL]] 1873 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) { 1874 return vmul_laneq_u16(a, v, 0); 1875 } 1876 1877 // CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 { 1878 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer 1879 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] 1880 // CHECK: ret <8 x i16> [[MUL]] 1881 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) { 1882 return vmulq_laneq_u16(a, v, 0); 1883 } 1884 1885 // CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 { 1886 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 1887 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] 1888 // CHECK: ret <2 x i32> [[MUL]] 1889 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) { 1890 return vmul_laneq_u32(a, v, 0); 1891 } 1892 1893 // CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 { 1894 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer 1895 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] 1896 // CHECK: ret <4 x i32> [[MUL]] 1897 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) { 1898 return vmulq_laneq_u32(a, v, 0); 1899 } 1900 1901 // CHECK-LABEL: define <2 x float> 
@test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { 1902 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 1903 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> 1904 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> 1905 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> 1906 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer 1907 // CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 1908 // CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 1909 // CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]]) 1910 // CHECK: ret <2 x float> [[FMLA2]] 1911 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { 1912 return vfma_lane_f32(a, b, v, 0); 1913 } 1914 1915 // CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { 1916 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> 1917 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> 1918 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> 1919 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> 1920 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer 1921 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 1922 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 1923 // CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]]) 1924 // CHECK: ret <4 x float> [[FMLA2]] 1925 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { 1926 return vfmaq_lane_f32(a, b, v, 0); 1927 } 1928 1929 // CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 { 1930 // CHECK: 
[[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 1931 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> 1932 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> 1933 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 1934 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 1935 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> 1936 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer 1937 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]]) 1938 // CHECK: ret <2 x float> [[TMP6]] 1939 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { 1940 return vfma_laneq_f32(a, b, v, 0); 1941 } 1942 1943 // CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { 1944 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> 1945 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> 1946 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> 1947 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 1948 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 1949 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> 1950 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer 1951 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]]) 1952 // CHECK: ret <4 x float> [[TMP6]] 1953 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { 1954 return vfmaq_laneq_f32(a, b, v, 0); 1955 } 1956 1957 // CHECK-LABEL: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { 1958 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b 1959 // CHECK: 
[[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 1960 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> 1961 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> 1962 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> 1963 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer 1964 // CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 1965 // CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 1966 // CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]]) 1967 // CHECK: ret <2 x float> [[FMLA2]] 1968 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { 1969 return vfms_lane_f32(a, b, v, 0); 1970 } 1971 1972 // CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 { 1973 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b 1974 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> 1975 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8> 1976 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8> 1977 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> 1978 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer 1979 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 1980 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 1981 // CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]]) 1982 // CHECK: ret <4 x float> [[FMLA2]] 1983 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { 1984 return vfmsq_lane_f32(a, b, v, 0); 1985 } 1986 1987 // CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x 
float> %b, <4 x float> %v) #0 { 1988 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b 1989 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 1990 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8> 1991 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> 1992 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 1993 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 1994 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> 1995 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer 1996 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]]) 1997 // CHECK: ret <2 x float> [[TMP6]] 1998 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { 1999 return vfms_laneq_f32(a, b, v, 0); 2000 } 2001 2002 // CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 { 2003 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b 2004 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> 2005 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8> 2006 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8> 2007 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 2008 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 2009 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> 2010 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer 2011 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]]) 2012 // CHECK: ret <4 x float> [[TMP6]] 2013 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) 
{ 2014 return vfmsq_laneq_f32(a, b, v, 0); 2015 } 2016 2017 // CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 { 2018 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> 2019 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> 2020 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8> 2021 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> 2022 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 2023 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> 2024 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer 2025 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) 2026 // CHECK: ret <2 x double> [[TMP6]] 2027 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { 2028 return vfmaq_laneq_f64(a, b, v, 0); 2029 } 2030 2031 // CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 { 2032 // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b 2033 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> 2034 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8> 2035 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8> 2036 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> 2037 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 2038 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> 2039 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer 2040 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) 2041 // CHECK: ret <2 x double> [[TMP6]] 2042 float64x2_t 
test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { 2043 return vfmsq_laneq_f64(a, b, v, 0); 2044 } 2045 2046 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 2047 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2048 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2049 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2050 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2051 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2052 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2053 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 2054 // CHECK: ret <4 x i32> [[ADD]] 2055 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { 2056 return vmlal_lane_s16(a, b, v, 0); 2057 } 2058 2059 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 2060 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2061 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2062 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2063 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2064 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2065 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2066 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 2067 // CHECK: ret <2 x i64> [[ADD]] 2068 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { 2069 return vmlal_lane_s32(a, b, v, 0); 2070 } 2071 2072 // CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 2073 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, 
<4 x i32> zeroinitializer 2074 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2075 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2076 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2077 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2078 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2079 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 2080 // CHECK: ret <4 x i32> [[ADD]] 2081 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { 2082 return vmlal_laneq_s16(a, b, v, 0); 2083 } 2084 2085 // CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 2086 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2087 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2088 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2089 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2090 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2091 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2092 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 2093 // CHECK: ret <2 x i64> [[ADD]] 2094 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { 2095 return vmlal_laneq_s32(a, b, v, 0); 2096 } 2097 2098 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 2099 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2100 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2101 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2102 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2103 // 
CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2104 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2105 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2106 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 2107 // CHECK: ret <4 x i32> [[ADD]] 2108 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { 2109 return vmlal_high_lane_s16(a, b, v, 0); 2110 } 2111 2112 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 2113 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 2114 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2115 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2116 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2117 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2118 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2119 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2120 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 2121 // CHECK: ret <2 x i64> [[ADD]] 2122 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { 2123 return vmlal_high_lane_s32(a, b, v, 0); 2124 } 2125 2126 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 2127 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2128 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2129 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2130 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2131 // CHECK: [[VMULL_I:%.*]] = bitcast <8 
x i8> [[TMP0]] to <4 x i16> 2132 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2133 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2134 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 2135 // CHECK: ret <4 x i32> [[ADD]] 2136 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { 2137 return vmlal_high_laneq_s16(a, b, v, 0); 2138 } 2139 2140 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 2141 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 2142 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2143 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2144 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2145 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2146 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2147 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2148 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 2149 // CHECK: ret <2 x i64> [[ADD]] 2150 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { 2151 return vmlal_high_laneq_s32(a, b, v, 0); 2152 } 2153 2154 // CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 2155 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2156 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2157 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2158 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2159 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2160 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2161 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 2162 // CHECK: ret <4 x i32> [[SUB]] 2163 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { 2164 return vmlsl_lane_s16(a, b, v, 0); 2165 } 2166 2167 // CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 2168 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2169 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2170 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2171 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2172 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2173 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2174 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 2175 // CHECK: ret <2 x i64> [[SUB]] 2176 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { 2177 return vmlsl_lane_s32(a, b, v, 0); 2178 } 2179 2180 // CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 2181 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2182 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2183 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2184 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2185 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2186 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2187 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 2188 // CHECK: ret <4 x i32> [[SUB]] 2189 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { 2190 return vmlsl_laneq_s16(a, b, v, 0); 2191 } 
2192 2193 // CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 2194 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2195 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2196 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2197 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2198 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2199 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2200 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 2201 // CHECK: ret <2 x i64> [[SUB]] 2202 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { 2203 return vmlsl_laneq_s32(a, b, v, 0); 2204 } 2205 2206 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 2207 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2208 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2209 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2210 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2211 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2212 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2213 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2214 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 2215 // CHECK: ret <4 x i32> [[SUB]] 2216 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { 2217 return vmlsl_high_lane_s16(a, b, v, 0); 2218 } 2219 2220 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 2221 // CHECK: [[SHUFFLE_I:%.*]] = 
shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 2222 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2223 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2224 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2225 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2226 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2227 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2228 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 2229 // CHECK: ret <2 x i64> [[SUB]] 2230 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { 2231 return vmlsl_high_lane_s32(a, b, v, 0); 2232 } 2233 2234 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 2235 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2236 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2237 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2238 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2239 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2240 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2241 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2242 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 2243 // CHECK: ret <4 x i32> [[SUB]] 2244 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { 2245 return vmlsl_high_laneq_s16(a, b, v, 0); 2246 } 2247 2248 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 2249 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x 
i32> %b, <2 x i32> <i32 2, i32 3> 2250 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2251 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2252 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2253 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2254 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2255 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2256 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 2257 // CHECK: ret <2 x i64> [[SUB]] 2258 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { 2259 return vmlsl_high_laneq_s32(a, b, v, 0); 2260 } 2261 2262 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 2263 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2264 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2265 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2266 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2267 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2268 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2269 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 2270 // CHECK: ret <4 x i32> [[ADD]] 2271 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) { 2272 return vmlal_lane_u16(a, b, v, 0); 2273 } 2274 2275 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 2276 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2277 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2278 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2279 // CHECK: 
[[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2280 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2281 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2282 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 2283 // CHECK: ret <2 x i64> [[ADD]] 2284 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { 2285 return vmlal_lane_u32(a, b, v, 0); 2286 } 2287 2288 // CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 2289 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2290 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2291 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2292 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2293 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2294 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2295 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 2296 // CHECK: ret <4 x i32> [[ADD]] 2297 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { 2298 return vmlal_laneq_u16(a, b, v, 0); 2299 } 2300 2301 // CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 2302 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2303 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2304 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2305 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2306 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2307 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2308 // CHECK: [[ADD:%.*]] = add <2 x i64> 
%a, [[VMULL2_I]] 2309 // CHECK: ret <2 x i64> [[ADD]] 2310 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { 2311 return vmlal_laneq_u32(a, b, v, 0); 2312 } 2313 2314 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 2315 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2316 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2317 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2318 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2319 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2320 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2321 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2322 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 2323 // CHECK: ret <4 x i32> [[ADD]] 2324 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { 2325 return vmlal_high_lane_u16(a, b, v, 0); 2326 } 2327 2328 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 2329 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 2330 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2331 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2332 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2333 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2334 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2335 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2336 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 2337 // CHECK: ret <2 x i64> 
[[ADD]] 2338 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { 2339 return vmlal_high_lane_u32(a, b, v, 0); 2340 } 2341 2342 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 2343 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2344 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2345 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2346 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2347 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2348 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2349 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2350 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] 2351 // CHECK: ret <4 x i32> [[ADD]] 2352 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { 2353 return vmlal_high_laneq_u16(a, b, v, 0); 2354 } 2355 2356 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 2357 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 2358 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2359 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2360 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2361 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2362 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2363 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2364 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] 2365 // CHECK: ret <2 x i64> [[ADD]] 2366 int64x2_t 
test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { 2367 return vmlal_high_laneq_u32(a, b, v, 0); 2368 } 2369 2370 // CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 2371 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2372 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2373 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2374 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2375 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2376 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2377 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 2378 // CHECK: ret <4 x i32> [[SUB]] 2379 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) { 2380 return vmlsl_lane_u16(a, b, v, 0); 2381 } 2382 2383 // CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 2384 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2385 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2386 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2387 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2388 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2389 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2390 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 2391 // CHECK: ret <2 x i64> [[SUB]] 2392 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { 2393 return vmlsl_lane_u32(a, b, v, 0); 2394 } 2395 2396 // CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 2397 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> 
%v, <4 x i32> zeroinitializer 2398 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2399 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2400 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2401 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2402 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2403 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 2404 // CHECK: ret <4 x i32> [[SUB]] 2405 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { 2406 return vmlsl_laneq_u16(a, b, v, 0); 2407 } 2408 2409 // CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 2410 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2411 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2412 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2413 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2414 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2415 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2416 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 2417 // CHECK: ret <2 x i64> [[SUB]] 2418 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { 2419 return vmlsl_laneq_u32(a, b, v, 0); 2420 } 2421 2422 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 2423 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2424 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2425 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2426 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2427 
// CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2428 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2429 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2430 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 2431 // CHECK: ret <4 x i32> [[SUB]] 2432 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { 2433 return vmlsl_high_lane_u16(a, b, v, 0); 2434 } 2435 2436 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 2437 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 2438 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2439 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2440 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2441 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2442 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2443 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2444 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 2445 // CHECK: ret <2 x i64> [[SUB]] 2446 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { 2447 return vmlsl_high_lane_u32(a, b, v, 0); 2448 } 2449 2450 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 2451 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2452 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2453 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2454 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2455 // CHECK: [[VMULL_I:%.*]] = bitcast 
<8 x i8> [[TMP0]] to <4 x i16> 2456 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2457 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2458 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] 2459 // CHECK: ret <4 x i32> [[SUB]] 2460 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { 2461 return vmlsl_high_laneq_u16(a, b, v, 0); 2462 } 2463 2464 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 2465 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 2466 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2467 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2468 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2469 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2470 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2471 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2472 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] 2473 // CHECK: ret <2 x i64> [[SUB]] 2474 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { 2475 return vmlsl_high_laneq_u32(a, b, v, 0); 2476 } 2477 2478 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { 2479 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2480 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 2481 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2482 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2483 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2484 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2485 // CHECK: ret <4 x i32> [[VMULL2_I]] 2486 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { 2487 return vmull_lane_s16(a, v, 0); 2488 } 2489 2490 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { 2491 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2492 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 2493 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2494 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2495 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2496 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2497 // CHECK: ret <2 x i64> [[VMULL2_I]] 2498 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { 2499 return vmull_lane_s32(a, v, 0); 2500 } 2501 2502 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 { 2503 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2504 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 2505 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2506 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2507 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2508 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2509 // CHECK: ret <4 x i32> [[VMULL2_I]] 2510 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { 2511 return vmull_lane_u16(a, v, 0); 2512 } 2513 2514 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 { 2515 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2516 // CHECK: [[TMP0:%.*]] = bitcast <2 x 
i32> %a to <8 x i8> 2517 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2518 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2519 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2520 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2521 // CHECK: ret <2 x i64> [[VMULL2_I]] 2522 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { 2523 return vmull_lane_u32(a, v, 0); 2524 } 2525 2526 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { 2527 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2528 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2529 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2530 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2531 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2532 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2533 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2534 // CHECK: ret <4 x i32> [[VMULL2_I]] 2535 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { 2536 return vmull_high_lane_s16(a, v, 0); 2537 } 2538 2539 // CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { 2540 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 2541 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2542 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2543 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2544 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2545 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> 
[[TMP1]] to <2 x i32> 2546 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2547 // CHECK: ret <2 x i64> [[VMULL2_I]] 2548 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { 2549 return vmull_high_lane_s32(a, v, 0); 2550 } 2551 2552 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 { 2553 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2554 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2555 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2556 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2557 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2558 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2559 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2560 // CHECK: ret <4 x i32> [[VMULL2_I]] 2561 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { 2562 return vmull_high_lane_u16(a, v, 0); 2563 } 2564 2565 // CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 { 2566 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 2567 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2568 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2569 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2570 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2571 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2572 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2573 // CHECK: ret <2 x i64> [[VMULL2_I]] 2574 
uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { 2575 return vmull_high_lane_u32(a, v, 0); 2576 } 2577 2578 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { 2579 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2580 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 2581 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2582 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2583 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2584 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2585 // CHECK: ret <4 x i32> [[VMULL2_I]] 2586 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { 2587 return vmull_laneq_s16(a, v, 0); 2588 } 2589 2590 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { 2591 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2592 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 2593 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2594 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2595 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2596 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2597 // CHECK: ret <2 x i64> [[VMULL2_I]] 2598 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { 2599 return vmull_laneq_s32(a, v, 0); 2600 } 2601 2602 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 { 2603 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2604 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 2605 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2606 // CHECK: 
[[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2607 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2608 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2609 // CHECK: ret <4 x i32> [[VMULL2_I]] 2610 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { 2611 return vmull_laneq_u16(a, v, 0); 2612 } 2613 2614 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 { 2615 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2616 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 2617 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2618 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2619 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2620 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2621 // CHECK: ret <2 x i64> [[VMULL2_I]] 2622 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { 2623 return vmull_laneq_u32(a, v, 0); 2624 } 2625 2626 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { 2627 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2628 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2629 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2630 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2631 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2632 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2633 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2634 // CHECK: ret <4 x i32> [[VMULL2_I]] 2635 int32x4_t 
test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { 2636 return vmull_high_laneq_s16(a, v, 0); 2637 } 2638 2639 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { 2640 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 2641 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2642 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2643 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2644 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2645 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2646 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2647 // CHECK: ret <2 x i64> [[VMULL2_I]] 2648 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { 2649 return vmull_high_laneq_s32(a, v, 0); 2650 } 2651 2652 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 { 2653 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2654 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2655 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2656 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2657 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2658 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2659 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2 2660 // CHECK: ret <4 x i32> [[VMULL2_I]] 2661 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { 2662 return vmull_high_laneq_u16(a, v, 0); 2663 } 2664 2665 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x 
i32> %v) #0 { 2666 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 2667 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2668 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2669 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2670 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2671 // CHECK: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2672 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2 2673 // CHECK: ret <2 x i64> [[VMULL2_I]] 2674 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { 2675 return vmull_high_laneq_u32(a, v, 0); 2676 } 2677 2678 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 2679 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2680 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 2681 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2682 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2683 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2684 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 2685 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 2686 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 2687 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 2688 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] 2689 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { 2690 return vqdmlal_lane_s16(a, b, v, 0); 2691 } 2692 2693 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x 
i32> %v) #0 { 2694 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2695 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 2696 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2697 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2698 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2699 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 2700 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 2701 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 2702 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 2703 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] 2704 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { 2705 return vqdmlal_lane_s32(a, b, v, 0); 2706 } 2707 2708 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 2709 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2710 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2711 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 2712 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2713 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2714 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2715 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 2716 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 2717 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 2718 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> 
[[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 2719 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] 2720 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { 2721 return vqdmlal_high_lane_s16(a, b, v, 0); 2722 } 2723 2724 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 2725 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 2726 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2727 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 2728 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2729 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2730 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2731 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 2732 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 2733 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 2734 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 2735 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] 2736 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { 2737 return vqdmlal_high_lane_s32(a, b, v, 0); 2738 } 2739 2740 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 { 2741 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2742 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 2743 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> 2744 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2745 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2746 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> 
[[TMP2]] to <4 x i16> 2747 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 2748 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 2749 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 2750 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] 2751 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { 2752 return vqdmlsl_lane_s16(a, b, v, 0); 2753 } 2754 2755 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 { 2756 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2757 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 2758 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> 2759 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2760 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2761 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 2762 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 2763 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 2764 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 2765 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] 2766 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { 2767 return vqdmlsl_lane_s32(a, b, v, 0); 2768 } 2769 2770 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 { 2771 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2772 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2773 // CHECK: 
[[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 2774 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2775 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2776 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2777 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 2778 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 2779 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 2780 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 2781 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] 2782 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { 2783 return vqdmlsl_high_lane_s16(a, b, v, 0); 2784 } 2785 2786 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 { 2787 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 2788 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2789 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 2790 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2791 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2792 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2793 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 2794 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 2795 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 2796 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 2797 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] 2798 int64x2_t 
test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { 2799 return vqdmlsl_high_lane_s32(a, b, v, 0); 2800 } 2801 2802 // CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { 2803 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2804 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 2805 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2806 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2807 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2808 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 2809 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> 2810 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> 2811 // CHECK: ret <4 x i32> [[TMP2]] 2812 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) { 2813 return vqdmull_lane_s16(a, v, 0); 2814 } 2815 2816 // CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { 2817 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2818 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 2819 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2820 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2821 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2822 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 2823 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> 2824 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> 2825 // CHECK: ret <2 x i64> [[TMP2]] 2826 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) { 2827 return 
vqdmull_lane_s32(a, v, 0); 2828 } 2829 2830 // CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { 2831 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2832 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 2833 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2834 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2835 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2836 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 2837 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> 2838 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> 2839 // CHECK: ret <4 x i32> [[TMP2]] 2840 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) { 2841 return vqdmull_laneq_s16(a, v, 0); 2842 } 2843 2844 // CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 { 2845 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2846 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 2847 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2848 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2849 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2850 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 2851 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> 2852 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> 2853 // CHECK: ret <2 x i64> [[TMP2]] 2854 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) { 2855 return vqdmull_laneq_s32(a, v, 0); 2856 } 2857 2858 // CHECK-LABEL: define <4 x i32> 
@test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { 2859 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2860 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2861 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2862 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2863 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2864 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2865 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 2866 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> 2867 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> 2868 // CHECK: ret <4 x i32> [[TMP2]] 2869 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { 2870 return vqdmull_high_lane_s16(a, v, 0); 2871 } 2872 2873 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { 2874 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 2875 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2876 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2877 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2878 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2879 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2880 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 2881 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> 2882 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> 2883 // CHECK: ret <2 x i64> 
[[TMP2]] 2884 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { 2885 return vqdmull_high_lane_s32(a, v, 0); 2886 } 2887 2888 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 { 2889 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2890 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 2891 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 2892 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2893 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2894 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2895 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2 2896 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> 2897 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> 2898 // CHECK: ret <4 x i32> [[TMP2]] 2899 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { 2900 return vqdmull_high_laneq_s16(a, v, 0); 2901 } 2902 2903 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 { 2904 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 2905 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 2906 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 2907 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2908 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2909 // CHECK: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2910 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2 2911 // CHECK: 
[[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> 2912 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> 2913 // CHECK: ret <2 x i64> [[TMP2]] 2914 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { 2915 return vqdmull_high_laneq_s32(a, v, 0); 2916 } 2917 2918 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { 2919 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2920 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 2921 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2922 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2923 // CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2924 // CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2 2925 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> 2926 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> 2927 // CHECK: ret <4 x i16> [[TMP2]] 2928 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { 2929 return vqdmulh_lane_s16(a, v, 0); 2930 } 2931 2932 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { 2933 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer 2934 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> 2935 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> 2936 // CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 2937 // CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> 2938 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2 2939 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> 2940 // 
CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> 2941 // CHECK: ret <8 x i16> [[TMP2]] 2942 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { 2943 return vqdmulhq_lane_s16(a, v, 0); 2944 } 2945 2946 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { 2947 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 2948 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 2949 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 2950 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 2951 // CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 2952 // CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2 2953 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> 2954 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> 2955 // CHECK: ret <2 x i32> [[TMP2]] 2956 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { 2957 return vqdmulh_lane_s32(a, v, 0); 2958 } 2959 2960 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { 2961 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer 2962 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 2963 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> 2964 // CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 2965 // CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 2966 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2 2967 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> 2968 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> 2969 // CHECK: 
ret <4 x i32> [[TMP2]] 2970 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { 2971 return vqdmulhq_lane_s32(a, v, 0); 2972 } 2973 2974 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 { 2975 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 2976 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 2977 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 2978 // CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2979 // CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 2980 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2 2981 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> 2982 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> 2983 // CHECK: ret <4 x i16> [[TMP2]] 2984 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { 2985 return vqrdmulh_lane_s16(a, v, 0); 2986 } 2987 2988 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 { 2989 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer 2990 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> 2991 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> 2992 // CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 2993 // CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> 2994 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2 2995 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> 2996 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> 2997 // CHECK: ret <8 x i16> [[TMP2]] 2998 int16x8_t 
test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { 2999 return vqrdmulhq_lane_s16(a, v, 0); 3000 } 3001 3002 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 { 3003 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 3004 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 3005 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 3006 // CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3007 // CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3008 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2 3009 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> 3010 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> 3011 // CHECK: ret <2 x i32> [[TMP2]] 3012 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { 3013 return vqrdmulh_lane_s32(a, v, 0); 3014 } 3015 3016 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 { 3017 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer 3018 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 3019 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> 3020 // CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 3021 // CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 3022 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2 3023 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> 3024 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> 3025 // CHECK: ret <4 x i32> [[TMP2]] 3026 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { 3027 
return vqrdmulhq_lane_s32(a, v, 0); 3028 } 3029 3030 // CHECK-LABEL: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 { 3031 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer 3032 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] 3033 // CHECK: ret <2 x float> [[MUL]] 3034 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { 3035 return vmul_lane_f32(a, v, 0); 3036 } 3037 3038 // CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 { 3039 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer 3040 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] 3041 // CHECK: ret <4 x float> [[MUL]] 3042 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { 3043 return vmulq_lane_f32(a, v, 0); 3044 } 3045 3046 // CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 { 3047 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer 3048 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] 3049 // CHECK: ret <2 x float> [[MUL]] 3050 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) { 3051 return vmul_laneq_f32(a, v, 0); 3052 } 3053 3054 // CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) #0 { 3055 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> 3056 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8> 3057 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double 3058 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 3059 // CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 3060 // CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] 3061 // CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> 3062 // CHECK: ret <1 x double> [[TMP5]] 3063 float64x1_t 
test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) { 3064 return vmul_laneq_f64(a, v, 0); 3065 } 3066 3067 // CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 { 3068 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer 3069 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] 3070 // CHECK: ret <4 x float> [[MUL]] 3071 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { 3072 return vmulq_laneq_f32(a, v, 0); 3073 } 3074 3075 // CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 { 3076 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer 3077 // CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]] 3078 // CHECK: ret <2 x double> [[MUL]] 3079 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { 3080 return vmulq_laneq_f64(a, v, 0); 3081 } 3082 3083 // CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 { 3084 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer 3085 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 3086 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> 3087 // CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 3088 // CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 3089 // CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2 3090 // CHECK: ret <2 x float> [[VMULX2_I]] 3091 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { 3092 return vmulx_lane_f32(a, v, 0); 3093 } 3094 3095 // CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 { 3096 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer 3097 // CHECK: [[TMP0:%.*]] = 
bitcast <4 x float> %a to <16 x i8> 3098 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> 3099 // CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 3100 // CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 3101 // CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 3102 // CHECK: ret <4 x float> [[VMULX2_I]] 3103 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { 3104 return vmulxq_lane_f32(a, v, 0); 3105 } 3106 3107 // CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) #0 { 3108 // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer 3109 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> 3110 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> 3111 // CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> 3112 // CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 3113 // CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 3114 // CHECK: ret <2 x double> [[VMULX2_I]] 3115 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { 3116 return vmulxq_lane_f64(a, v, 0); 3117 } 3118 3119 // CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 { 3120 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer 3121 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 3122 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> 3123 // CHECK: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 3124 // CHECK: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 3125 // CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> 
[[VMULX1_I]]) #2 3126 // CHECK: ret <2 x float> [[VMULX2_I]] 3127 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { 3128 return vmulx_laneq_f32(a, v, 0); 3129 } 3130 3131 // CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 { 3132 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer 3133 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> 3134 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> 3135 // CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 3136 // CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 3137 // CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2 3138 // CHECK: ret <4 x float> [[VMULX2_I]] 3139 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { 3140 return vmulxq_laneq_f32(a, v, 0); 3141 } 3142 3143 // CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 { 3144 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer 3145 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> 3146 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> 3147 // CHECK: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> 3148 // CHECK: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> 3149 // CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2 3150 // CHECK: ret <2 x double> [[VMULX2_I]] 3151 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { 3152 return vmulxq_laneq_f64(a, v, 0); 3153 } 3154 3155 // CHECK-LABEL: define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { 3156 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 
6, i32 7> 3157 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> 3158 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 3159 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1 3160 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 3161 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 3162 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> 3163 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3164 // CHECK: [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3165 // CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2 3166 // CHECK: ret <4 x i32> [[VMULL5_I_I]] 3167 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) { 3168 return vmull_high_n_s16(a, b); 3169 } 3170 3171 // CHECK-LABEL: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { 3172 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 3173 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> 3174 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3175 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 3176 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> 3177 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3178 // CHECK: [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3179 // CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2 3180 // CHECK: ret <2 x i64> [[VMULL3_I_I]] 3181 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) { 3182 return vmull_high_n_s32(a, b); 3183 } 3184 3185 // CHECK-LABEL: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> 
%a, i16 %b) #0 { 3186 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 3187 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> 3188 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 3189 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1 3190 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 3191 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 3192 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> 3193 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3194 // CHECK: [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3195 // CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2 3196 // CHECK: ret <4 x i32> [[VMULL5_I_I]] 3197 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) { 3198 return vmull_high_n_u16(a, b); 3199 } 3200 3201 // CHECK-LABEL: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 { 3202 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 3203 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> 3204 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3205 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 3206 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> 3207 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3208 // CHECK: [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3209 // CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2 3210 // CHECK: ret <2 x i64> [[VMULL3_I_I]] 3211 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, 
uint32_t b) { 3212 return vmull_high_n_u32(a, b); 3213 } 3214 3215 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { 3216 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 3217 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> 3218 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 3219 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1 3220 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2 3221 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3 3222 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> 3223 // CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3224 // CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3225 // CHECK: [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V4_I_I]]) #2 3226 // CHECK: [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8> 3227 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I_I]] to <4 x i32> 3228 // CHECK: ret <4 x i32> [[TMP2]] 3229 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) { 3230 return vqdmull_high_n_s16(a, b); 3231 } 3232 3233 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { 3234 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> 3235 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> 3236 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3237 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1 3238 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> 3239 // CHECK: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <2 x i32> 3240 // CHECK: [[VQDMULL_V2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3241 // CHECK: [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V2_I_I]]) #2 3242 // CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8> 3243 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I_I]] to <2 x i64> 3244 // CHECK: ret <2 x i64> [[TMP2]] 3245 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) { 3246 return vqdmull_high_n_s32(a, b); 3247 } 3248 3249 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { 3250 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 3251 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3252 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 3253 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 3254 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 3255 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> 3256 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> 3257 // CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3258 // CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3259 // CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 3260 // CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] 3261 // CHECK: ret <4 x i32> [[ADD_I_I]] 3262 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { 3263 return vmlal_high_n_s16(a, b, c); 3264 } 3265 3266 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { 3267 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x 
i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 3268 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 3269 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 3270 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> 3271 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> 3272 // CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3273 // CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3274 // CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 3275 // CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] 3276 // CHECK: ret <2 x i64> [[ADD_I_I]] 3277 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { 3278 return vmlal_high_n_s32(a, b, c); 3279 } 3280 3281 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { 3282 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 3283 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3284 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 3285 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 3286 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 3287 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> 3288 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> 3289 // CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3290 // CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3291 // CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 3292 // CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, 
[[VMULL2_I_I_I]] 3293 // CHECK: ret <4 x i32> [[ADD_I_I]] 3294 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { 3295 return vmlal_high_n_u16(a, b, c); 3296 } 3297 3298 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { 3299 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 3300 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 3301 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 3302 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> 3303 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> 3304 // CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3305 // CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3306 // CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 3307 // CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] 3308 // CHECK: ret <2 x i64> [[ADD_I_I]] 3309 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { 3310 return vmlal_high_n_u32(a, b, c); 3311 } 3312 3313 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { 3314 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 3315 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 3316 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> 3317 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3318 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 3319 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 3320 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 3321 // CHECK: 
[[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> 3322 // CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3323 // CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 3324 // CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2 3325 // CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 3326 // CHECK: [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2 3327 // CHECK: ret <4 x i32> [[VQDMLAL_V6_I_I]] 3328 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { 3329 return vqdmlal_high_n_s16(a, b, c); 3330 } 3331 3332 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { 3333 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 3334 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 3335 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> 3336 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 3337 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 3338 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> 3339 // CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3340 // CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 3341 // CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2 3342 // CHECK: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 3343 // CHECK: [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2 3344 // CHECK: ret <2 x i64> [[VQDMLAL_V4_I_I]] 3345 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, 
int32_t c) { 3346 return vqdmlal_high_n_s32(a, b, c); 3347 } 3348 3349 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { 3350 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 3351 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3352 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 3353 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 3354 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 3355 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> 3356 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> 3357 // CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3358 // CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3359 // CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 3360 // CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] 3361 // CHECK: ret <4 x i32> [[SUB_I_I]] 3362 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { 3363 return vmlsl_high_n_s16(a, b, c); 3364 } 3365 3366 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { 3367 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 3368 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 3369 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 3370 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> 3371 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> 3372 // CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3373 // CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> 
[[TMP1]] to <2 x i32> 3374 // CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 3375 // CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] 3376 // CHECK: ret <2 x i64> [[SUB_I_I]] 3377 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { 3378 return vmlsl_high_n_s32(a, b, c); 3379 } 3380 3381 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { 3382 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 3383 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3384 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 3385 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 3386 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 3387 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> 3388 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> 3389 // CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3390 // CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3391 // CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2 3392 // CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] 3393 // CHECK: ret <4 x i32> [[SUB_I_I]] 3394 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { 3395 return vmlsl_high_n_u16(a, b, c); 3396 } 3397 3398 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { 3399 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 3400 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 3401 // CHECK: [[VECINIT1_I_I:%.*]] = 
insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 3402 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> 3403 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> 3404 // CHECK: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3405 // CHECK: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3406 // CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2 3407 // CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] 3408 // CHECK: ret <2 x i64> [[SUB_I_I]] 3409 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { 3410 return vmlsl_high_n_u32(a, b, c); 3411 } 3412 3413 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { 3414 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 3415 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 3416 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> 3417 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3418 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1 3419 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2 3420 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3 3421 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8> 3422 // CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3423 // CHECK: [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 3424 // CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2 3425 // CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 3426 // CHECK: [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2 3427 // CHECK: ret <4 x i32> [[VQDMLSL_V6_I_I]] 3428 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { 3429 return vqdmlsl_high_n_s16(a, b, c); 3430 } 3431 3432 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { 3433 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 3434 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 3435 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> 3436 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 3437 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1 3438 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8> 3439 // CHECK: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3440 // CHECK: [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 3441 // CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2 3442 // CHECK: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 3443 // CHECK: [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2 3444 // CHECK: ret <2 x i64> [[VQDMLSL_V4_I_I]] 3445 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { 3446 return vqdmlsl_high_n_s32(a, b, c); 3447 } 3448 3449 // CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 { 3450 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0 3451 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1 3452 // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]] 3453 // CHECK: ret <2 x float> [[MUL_I]] 3454 float32x2_t test_vmul_n_f32(float32x2_t a, 
float32_t b) { 3455 return vmul_n_f32(a, b); 3456 } 3457 3458 // CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 { 3459 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0 3460 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1 3461 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2 3462 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3 3463 // CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]] 3464 // CHECK: ret <4 x float> [[MUL_I]] 3465 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) { 3466 return vmulq_n_f32(a, b); 3467 } 3468 3469 // CHECK-LABEL: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 { 3470 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0 3471 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1 3472 // CHECK: [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]] 3473 // CHECK: ret <2 x double> [[MUL_I]] 3474 float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) { 3475 return vmulq_n_f64(a, b); 3476 } 3477 3478 // CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { 3479 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0 3480 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1 3481 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 3482 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> 3483 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> 3484 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 3485 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 3486 // CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> 3487 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2 3488 // CHECK: ret <2 x float> [[TMP6]] 3489 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { 3490 return vfma_n_f32(a, b, n); 3491 } 3492 3493 // CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { 3494 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0 3495 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1 3496 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2 3497 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3 3498 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> 3499 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> 3500 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> 3501 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 3502 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 3503 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> 3504 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2 3505 // CHECK: ret <4 x float> [[TMP6]] 3506 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { 3507 return vfmaq_n_f32(a, b, n); 3508 } 3509 3510 // CHECK-LABEL: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { 3511 // CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b 3512 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0 3513 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1 3514 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> 3515 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8> 3516 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x 
i8> 3517 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> 3518 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> 3519 // CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> 3520 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2 3521 // CHECK: ret <2 x float> [[TMP6]] 3522 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) { 3523 return vfms_n_f32(a, b, n); 3524 } 3525 3526 // CHECK-LABEL: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { 3527 // CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b 3528 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0 3529 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1 3530 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2 3531 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3 3532 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> 3533 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8> 3534 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> 3535 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> 3536 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> 3537 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> 3538 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2 3539 // CHECK: ret <4 x float> [[TMP6]] 3540 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { 3541 return vfmsq_n_f32(a, b, n); 3542 } 3543 3544 // CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 { 3545 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, 
i16 %b, i32 0 3546 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 3547 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 3548 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 3549 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] 3550 // CHECK: ret <4 x i16> [[MUL_I]] 3551 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) { 3552 return vmul_n_s16(a, b); 3553 } 3554 3555 // CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 { 3556 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 3557 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 3558 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 3559 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 3560 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 3561 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 3562 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 3563 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 3564 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] 3565 // CHECK: ret <8 x i16> [[MUL_I]] 3566 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) { 3567 return vmulq_n_s16(a, b); 3568 } 3569 3570 // CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 { 3571 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3572 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 3573 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] 3574 // CHECK: ret <2 x i32> [[MUL_I]] 3575 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) { 3576 return vmul_n_s32(a, b); 3577 } 3578 3579 // CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> 
%a, i32 %b) #0 { 3580 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 3581 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 3582 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 3583 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 3584 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] 3585 // CHECK: ret <4 x i32> [[MUL_I]] 3586 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) { 3587 return vmulq_n_s32(a, b); 3588 } 3589 3590 // CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 { 3591 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 3592 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 3593 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 3594 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 3595 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] 3596 // CHECK: ret <4 x i16> [[MUL_I]] 3597 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) { 3598 return vmul_n_u16(a, b); 3599 } 3600 3601 // CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 { 3602 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 3603 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 3604 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 3605 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 3606 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 3607 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 3608 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 3609 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 
3610 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] 3611 // CHECK: ret <8 x i16> [[MUL_I]] 3612 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) { 3613 return vmulq_n_u16(a, b); 3614 } 3615 3616 // CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 { 3617 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3618 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 3619 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] 3620 // CHECK: ret <2 x i32> [[MUL_I]] 3621 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) { 3622 return vmul_n_u32(a, b); 3623 } 3624 3625 // CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 { 3626 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 3627 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 3628 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 3629 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 3630 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] 3631 // CHECK: ret <4 x i32> [[MUL_I]] 3632 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) { 3633 return vmulq_n_u32(a, b); 3634 } 3635 3636 // CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 { 3637 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 3638 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 3639 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 3640 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 3641 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 3642 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 3643 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3644 // CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> 
[[TMP1]] to <4 x i16> 3645 // CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2 3646 // CHECK: ret <4 x i32> [[VMULL5_I]] 3647 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { 3648 return vmull_n_s16(a, b); 3649 } 3650 3651 // CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 { 3652 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 3653 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3654 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 3655 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 3656 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3657 // CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3658 // CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2 3659 // CHECK: ret <2 x i64> [[VMULL3_I]] 3660 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { 3661 return vmull_n_s32(a, b); 3662 } 3663 3664 // CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 { 3665 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 3666 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 3667 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 3668 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 3669 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 3670 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 3671 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3672 // CHECK: [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3673 // CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2 3674 // CHECK: ret <4 x i32> [[VMULL5_I]] 3675 
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { 3676 return vmull_n_u16(a, b); 3677 } 3678 3679 // CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 { 3680 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 3681 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3682 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 3683 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 3684 // CHECK: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3685 // CHECK: [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3686 // CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2 3687 // CHECK: ret <2 x i64> [[VMULL3_I]] 3688 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { 3689 return vmull_n_u32(a, b); 3690 } 3691 3692 // CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 { 3693 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 3694 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 3695 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 3696 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 3697 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 3698 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 3699 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3700 // CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3701 // CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #2 3702 // CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8> 3703 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32> 3704 // CHECK: ret <4 x i32> [[TMP2]] 3705 int32x4_t 
test_vqdmull_n_s16(int16x4_t a, int16_t b) { 3706 return vqdmull_n_s16(a, b); 3707 } 3708 3709 // CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 { 3710 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 3711 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3712 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 3713 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 3714 // CHECK: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3715 // CHECK: [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3716 // CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #2 3717 // CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8> 3718 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64> 3719 // CHECK: ret <2 x i64> [[TMP2]] 3720 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) { 3721 return vqdmull_n_s32(a, b); 3722 } 3723 3724 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { 3725 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 3726 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 3727 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 3728 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 3729 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 3730 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 3731 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3732 // CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3733 // CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #2 3734 // CHECK: [[VQDMULH_V6_I:%.*]] = 
bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8> 3735 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16> 3736 // CHECK: ret <4 x i16> [[TMP2]] 3737 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { 3738 return vqdmulh_n_s16(a, b); 3739 } 3740 3741 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { 3742 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> 3743 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 3744 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 3745 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 3746 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 3747 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 3748 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 3749 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 3750 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 3751 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> 3752 // CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 3753 // CHECK: [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> 3754 // CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #2 3755 // CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8> 3756 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16> 3757 // CHECK: ret <8 x i16> [[TMP2]] 3758 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) { 3759 return vqdmulhq_n_s16(a, b); 3760 } 3761 3762 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { 3763 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 3764 // CHECK: 
[[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3765 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 3766 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 3767 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3768 // CHECK: [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3769 // CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #2 3770 // CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8> 3771 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32> 3772 // CHECK: ret <2 x i32> [[TMP2]] 3773 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) { 3774 return vqdmulh_n_s32(a, b); 3775 } 3776 3777 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { 3778 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 3779 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0 3780 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 3781 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 3782 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 3783 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> 3784 // CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 3785 // CHECK: [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 3786 // CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #2 3787 // CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8> 3788 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32> 3789 // CHECK: ret <4 x i32> [[TMP2]] 3790 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) { 3791 return 
vqdmulhq_n_s32(a, b); 3792 } 3793 3794 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 { 3795 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 3796 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0 3797 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 3798 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 3799 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 3800 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 3801 // CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3802 // CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3803 // CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #2 3804 // CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8> 3805 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16> 3806 // CHECK: ret <4 x i16> [[TMP2]] 3807 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { 3808 return vqrdmulh_n_s16(a, b); 3809 } 3810 3811 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 { 3812 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> 3813 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0 3814 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 3815 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 3816 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 3817 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 3818 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 3819 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 3820 // 
CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 3821 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> 3822 // CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 3823 // CHECK: [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> 3824 // CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #2 3825 // CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8> 3826 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16> 3827 // CHECK: ret <8 x i16> [[TMP2]] 3828 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { 3829 return vqrdmulhq_n_s16(a, b); 3830 } 3831 3832 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 { 3833 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 3834 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0 3835 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 3836 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 3837 // CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3838 // CHECK: [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3839 // CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #2 3840 // CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8> 3841 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32> 3842 // CHECK: ret <2 x i32> [[TMP2]] 3843 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { 3844 return vqrdmulh_n_s32(a, b); 3845 } 3846 3847 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 { 3848 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 3849 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x 
i32> undef, i32 %b, i32 0 3850 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 3851 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 3852 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 3853 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> 3854 // CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 3855 // CHECK: [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 3856 // CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #2 3857 // CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8> 3858 // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32> 3859 // CHECK: ret <4 x i32> [[TMP2]] 3860 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { 3861 return vqrdmulhq_n_s32(a, b); 3862 } 3863 3864 // CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { 3865 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3866 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 3867 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 3868 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 3869 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] 3870 // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] 3871 // CHECK: ret <4 x i16> [[ADD_I]] 3872 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { 3873 return vmla_n_s16(a, b, c); 3874 } 3875 3876 // CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { 3877 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 3878 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 3879 // 
CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 3880 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 3881 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 3882 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 3883 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 3884 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 3885 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] 3886 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] 3887 // CHECK: ret <8 x i16> [[ADD_I]] 3888 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { 3889 return vmlaq_n_s16(a, b, c); 3890 } 3891 3892 // CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { 3893 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 3894 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 3895 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] 3896 // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] 3897 // CHECK: ret <2 x i32> [[ADD_I]] 3898 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { 3899 return vmla_n_s32(a, b, c); 3900 } 3901 3902 // CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { 3903 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 3904 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 3905 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 3906 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 3907 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] 3908 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] 3909 // CHECK: ret <4 x i32> [[ADD_I]] 3910 int32x4_t 
test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { 3911 return vmlaq_n_s32(a, b, c); 3912 } 3913 3914 // CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { 3915 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3916 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 3917 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 3918 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 3919 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] 3920 // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] 3921 // CHECK: ret <4 x i16> [[ADD_I]] 3922 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { 3923 return vmla_n_u16(a, b, c); 3924 } 3925 3926 // CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { 3927 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 3928 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 3929 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 3930 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 3931 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 3932 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 3933 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 3934 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 3935 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] 3936 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] 3937 // CHECK: ret <8 x i16> [[ADD_I]] 3938 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { 3939 return vmlaq_n_u16(a, b, c); 3940 } 3941 3942 // CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, 
i32 %c) #0 { 3943 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 3944 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 3945 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] 3946 // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] 3947 // CHECK: ret <2 x i32> [[ADD_I]] 3948 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { 3949 return vmla_n_u32(a, b, c); 3950 } 3951 3952 // CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { 3953 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 3954 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 3955 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 3956 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 3957 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] 3958 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] 3959 // CHECK: ret <4 x i32> [[ADD_I]] 3960 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { 3961 return vmlaq_n_u32(a, b, c); 3962 } 3963 3964 // CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { 3965 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3966 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 3967 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 3968 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 3969 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 3970 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 3971 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3972 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 3973 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 3974 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] 3975 // CHECK: ret <4 x i32> [[ADD_I]] 3976 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { 3977 return vmlal_n_s16(a, b, c); 3978 } 3979 3980 // CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { 3981 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 3982 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 3983 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 3984 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 3985 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3986 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 3987 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 3988 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] 3989 // CHECK: ret <2 x i64> [[ADD_I]] 3990 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { 3991 return vmlal_n_s32(a, b, c); 3992 } 3993 3994 // CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { 3995 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 3996 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 3997 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 3998 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 3999 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 4000 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 4001 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 4002 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 4003 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 4004 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] 4005 // CHECK: ret <4 x i32> [[ADD_I]] 4006 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { 4007 return vmlal_n_u16(a, b, c); 4008 } 4009 4010 // CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { 4011 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 4012 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 4013 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 4014 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 4015 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 4016 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 4017 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 4018 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] 4019 // CHECK: ret <2 x i64> [[ADD_I]] 4020 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { 4021 return vmlal_n_u32(a, b, c); 4022 } 4023 4024 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { 4025 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 4026 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> 4027 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 4028 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 4029 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 4030 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 4031 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 4032 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 4033 // CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> 
[[TMP2]] to <4 x i16> 4034 // CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2 4035 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 4036 // CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2 4037 // CHECK: ret <4 x i32> [[VQDMLAL_V6_I]] 4038 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { 4039 return vqdmlal_n_s16(a, b, c); 4040 } 4041 4042 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { 4043 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 4044 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> 4045 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 4046 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 4047 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 4048 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 4049 // CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 4050 // CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2 4051 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 4052 // CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2 4053 // CHECK: ret <2 x i64> [[VQDMLAL_V4_I]] 4054 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { 4055 return vqdmlal_n_s32(a, b, c); 4056 } 4057 4058 // CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { 4059 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 4060 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 4061 // CHECK: [[VECINIT2_I:%.*]] = insertelement 
<4 x i16> [[VECINIT1_I]], i16 %c, i32 2 4062 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 4063 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] 4064 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] 4065 // CHECK: ret <4 x i16> [[SUB_I]] 4066 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { 4067 return vmls_n_s16(a, b, c); 4068 } 4069 4070 // CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { 4071 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 4072 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 4073 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 4074 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 4075 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 4076 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 4077 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 4078 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 4079 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] 4080 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] 4081 // CHECK: ret <8 x i16> [[SUB_I]] 4082 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { 4083 return vmlsq_n_s16(a, b, c); 4084 } 4085 4086 // CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { 4087 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 4088 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 4089 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] 4090 // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] 4091 // CHECK: ret <2 x i32> [[SUB_I]] 4092 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { 4093 
return vmls_n_s32(a, b, c); 4094 } 4095 4096 // CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { 4097 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 4098 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 4099 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 4100 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 4101 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] 4102 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] 4103 // CHECK: ret <4 x i32> [[SUB_I]] 4104 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { 4105 return vmlsq_n_s32(a, b, c); 4106 } 4107 4108 // CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 { 4109 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 4110 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 4111 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 4112 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 4113 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] 4114 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] 4115 // CHECK: ret <4 x i16> [[SUB_I]] 4116 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { 4117 return vmls_n_u16(a, b, c); 4118 } 4119 4120 // CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 { 4121 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0 4122 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 4123 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 4124 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 4125 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> 
[[VECINIT3_I]], i16 %c, i32 4 4126 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 4127 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 4128 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 4129 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] 4130 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] 4131 // CHECK: ret <8 x i16> [[SUB_I]] 4132 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { 4133 return vmlsq_n_u16(a, b, c); 4134 } 4135 4136 // CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 { 4137 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 4138 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 4139 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] 4140 // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] 4141 // CHECK: ret <2 x i32> [[SUB_I]] 4142 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { 4143 return vmls_n_u32(a, b, c); 4144 } 4145 4146 // CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 { 4147 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 4148 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 4149 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 4150 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 4151 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] 4152 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] 4153 // CHECK: ret <4 x i32> [[SUB_I]] 4154 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { 4155 return vmlsq_n_u32(a, b, c); 4156 } 4157 4158 // CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { 4159 // CHECK: [[VECINIT_I:%.*]] = 
insertelement <4 x i16> undef, i16 %c, i32 0 4160 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 4161 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 4162 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 4163 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 4164 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 4165 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 4166 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 4167 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 4168 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] 4169 // CHECK: ret <4 x i32> [[SUB_I]] 4170 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { 4171 return vmlsl_n_s16(a, b, c); 4172 } 4173 4174 // CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { 4175 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 4176 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 4177 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 4178 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 4179 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 4180 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 4181 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 4182 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] 4183 // CHECK: ret <2 x i64> [[SUB_I]] 4184 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { 4185 return vmlsl_n_s32(a, b, c); 4186 } 4187 4188 // CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { 4189 // CHECK: [[VECINIT_I:%.*]] = 
insertelement <4 x i16> undef, i16 %c, i32 0 4190 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 4191 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 4192 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 4193 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> 4194 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 4195 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 4196 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 4197 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2 4198 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] 4199 // CHECK: ret <4 x i32> [[SUB_I]] 4200 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { 4201 return vmlsl_n_u16(a, b, c); 4202 } 4203 4204 // CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { 4205 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 4206 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 4207 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> 4208 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 4209 // CHECK: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 4210 // CHECK: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 4211 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2 4212 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] 4213 // CHECK: ret <2 x i64> [[SUB_I]] 4214 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { 4215 return vmlsl_n_u32(a, b, c); 4216 } 4217 4218 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 { 4219 // CHECK: 
[[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 4220 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> 4221 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0 4222 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 4223 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 4224 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 4225 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> 4226 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 4227 // CHECK: [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 4228 // CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2 4229 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 4230 // CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2 4231 // CHECK: ret <4 x i32> [[VQDMLSL_V6_I]] 4232 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { 4233 return vqdmlsl_n_s16(a, b, c); 4234 } 4235 4236 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 { 4237 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 4238 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> 4239 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0 4240 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 4241 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> 4242 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 4243 // CHECK: [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 4244 // CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2 4245 // CHECK: [[VQDMLSL_V_I:%.*]] 
= bitcast <16 x i8> [[TMP0]] to <2 x i64> 4246 // CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2 4247 // CHECK: ret <2 x i64> [[VQDMLSL_V4_I]] 4248 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { 4249 return vqdmlsl_n_s32(a, b, c); 4250 } 4251 4252 // CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { 4253 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 4254 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] 4255 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] 4256 // CHECK: ret <4 x i16> [[ADD]] 4257 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { 4258 return vmla_lane_u16(a, b, v, 0); 4259 } 4260 4261 // CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { 4262 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer 4263 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] 4264 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] 4265 // CHECK: ret <8 x i16> [[ADD]] 4266 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { 4267 return vmlaq_lane_u16(a, b, v, 0); 4268 } 4269 4270 // CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { 4271 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 4272 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] 4273 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] 4274 // CHECK: ret <2 x i32> [[ADD]] 4275 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { 4276 return vmla_lane_u32(a, b, v, 0); 4277 } 4278 4279 // CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { 4280 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x 
i32> %v, <4 x i32> zeroinitializer 4281 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] 4282 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] 4283 // CHECK: ret <4 x i32> [[ADD]] 4284 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { 4285 return vmlaq_lane_u32(a, b, v, 0); 4286 } 4287 4288 // CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { 4289 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 4290 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] 4291 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] 4292 // CHECK: ret <4 x i16> [[ADD]] 4293 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { 4294 return vmla_laneq_u16(a, b, v, 0); 4295 } 4296 4297 // CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { 4298 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer 4299 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] 4300 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] 4301 // CHECK: ret <8 x i16> [[ADD]] 4302 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { 4303 return vmlaq_laneq_u16(a, b, v, 0); 4304 } 4305 4306 // CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { 4307 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 4308 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] 4309 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] 4310 // CHECK: ret <2 x i32> [[ADD]] 4311 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { 4312 return vmla_laneq_u32(a, b, v, 0); 4313 } 4314 4315 // CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { 4316 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> 
zeroinitializer 4317 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] 4318 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] 4319 // CHECK: ret <4 x i32> [[ADD]] 4320 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { 4321 return vmlaq_laneq_u32(a, b, v, 0); 4322 } 4323 4324 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 4325 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 4326 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 4327 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> 4328 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 4329 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 4330 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 4331 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 4332 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 4333 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 4334 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] 4335 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { 4336 return vqdmlal_laneq_s16(a, b, v, 0); 4337 } 4338 4339 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 4340 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 4341 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 4342 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> 4343 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 4344 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 4345 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 4346 // CHECK: [[VQDMLAL2_I:%.*]] = 
call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 4347 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 4348 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 4349 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] 4350 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { 4351 return vqdmlal_laneq_s32(a, b, v, 0); 4352 } 4353 4354 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 4355 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 4356 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 4357 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 4358 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 4359 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 4360 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 4361 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 4362 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 4363 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 4364 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 4365 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] 4366 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { 4367 return vqdmlal_high_laneq_s16(a, b, v, 0); 4368 } 4369 4370 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 4371 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 4372 // CHECK: [[SHUFFLE:%.*]] = 
shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 4373 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 4374 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 4375 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 4376 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 4377 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 4378 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 4379 // CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 4380 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 4381 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] 4382 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { 4383 return vqdmlal_high_laneq_s32(a, b, v, 0); 4384 } 4385 4386 // CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 { 4387 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer 4388 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] 4389 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] 4390 // CHECK: ret <4 x i16> [[SUB]] 4391 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { 4392 return vmls_lane_u16(a, b, v, 0); 4393 } 4394 4395 // CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 { 4396 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer 4397 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] 4398 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] 4399 // CHECK: ret <8 x i16> [[SUB]] 4400 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { 4401 return vmlsq_lane_u16(a, b, v, 0); 4402 } 4403 4404 // CHECK-LABEL: define <2 x i32> 
@test_vmls_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 { 4405 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer 4406 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] 4407 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] 4408 // CHECK: ret <2 x i32> [[SUB]] 4409 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { 4410 return vmls_lane_u32(a, b, v, 0); 4411 } 4412 4413 // CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 { 4414 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer 4415 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] 4416 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] 4417 // CHECK: ret <4 x i32> [[SUB]] 4418 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { 4419 return vmlsq_lane_u32(a, b, v, 0); 4420 } 4421 4422 // CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { 4423 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 4424 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] 4425 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] 4426 // CHECK: ret <4 x i16> [[SUB]] 4427 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { 4428 return vmls_laneq_u16(a, b, v, 0); 4429 } 4430 4431 // CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 { 4432 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer 4433 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] 4434 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] 4435 // CHECK: ret <8 x i16> [[SUB]] 4436 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { 4437 return vmlsq_laneq_u16(a, b, v, 0); 4438 } 4439 4440 // CHECK-LABEL: define <2 x i32> 
@test_vmls_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 { 4441 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 4442 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] 4443 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] 4444 // CHECK: ret <2 x i32> [[SUB]] 4445 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { 4446 return vmls_laneq_u32(a, b, v, 0); 4447 } 4448 4449 // CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 { 4450 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer 4451 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] 4452 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] 4453 // CHECK: ret <4 x i32> [[SUB]] 4454 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { 4455 return vmlsq_laneq_u32(a, b, v, 0); 4456 } 4457 4458 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 { 4459 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 4460 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 4461 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> 4462 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 4463 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 4464 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 4465 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 4466 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 4467 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 4468 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] 4469 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { 4470 
return vqdmlsl_laneq_s16(a, b, v, 0); 4471 } 4472 4473 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 { 4474 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 4475 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 4476 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> 4477 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 4478 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 4479 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 4480 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 4481 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 4482 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 4483 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] 4484 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { 4485 return vqdmlsl_laneq_s32(a, b, v, 0); 4486 } 4487 4488 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 { 4489 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 4490 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 4491 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> 4492 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 4493 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 4494 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> 4495 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> 4496 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2 4497 // CHECK: 
[[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 4498 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2 4499 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] 4500 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { 4501 return vqdmlsl_high_laneq_s16(a, b, v, 0); 4502 } 4503 4504 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 { 4505 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3> 4506 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer 4507 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> 4508 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 4509 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 4510 // CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 4511 // CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> 4512 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2 4513 // CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 4514 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2 4515 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] 4516 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { 4517 return vqdmlsl_high_laneq_s32(a, b, v, 0); 4518 } 4519 4520 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 { 4521 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer 4522 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 4523 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> 4524 // CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 
x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqdmulh_laneq_s16(a, v, 0);
}

// vqdmulhq_laneq_s16 with lane 0: the lane broadcast is expected as a
// shufflevector with an all-zero (zeroinitializer) mask, and the multiply as
// a call to @llvm.aarch64.neon.sqdmulh.v8i16.
// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqdmulhq_laneq_s16(a, v, 0);
}

// vqdmulh_laneq_s32 with lane 0: all-zero shuffle mask narrowing <4 x i32> %v
// to a <2 x i32> splat, then @llvm.aarch64.neon.sqdmulh.v2i32.
// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqdmulh_laneq_s32(a, v, 0);
}

// vqdmulhq_laneq_s32 with lane 0: all-zero shuffle mask, then
// @llvm.aarch64.neon.sqdmulh.v4i32.
// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqdmulhq_laneq_s32(a, v, 0);
}

// vqrdmulh_laneq_s16 with lane 0: same shape as the vqdmulh lane-0 tests, but
// the expected intrinsic is @llvm.aarch64.neon.sqrdmulh.v4i16.
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqrdmulh_laneq_s16(a, v, 0);
}

// vqrdmulhq_laneq_s16 with lane 0: all-zero shuffle mask, then
// @llvm.aarch64.neon.sqrdmulh.v8i16.
// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqrdmulhq_laneq_s16(a, v, 0);
}

// vqrdmulh_laneq_s32 with lane 0: all-zero shuffle mask, then
// @llvm.aarch64.neon.sqrdmulh.v2i32.
// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqrdmulh_laneq_s32(a, v, 0);
}

// vqrdmulhq_laneq_s32 with lane 0: all-zero shuffle mask, then
// @llvm.aarch64.neon.sqrdmulh.v4i32.
// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqrdmulhq_laneq_s32(a, v, 0);
}

// Unsigned multiply-accumulate by element. Each test below expects three IR
// steps: splat the selected lane of %v with shufflevector, multiply it with
// %b using a plain IR mul, and add the product to %a.

// vmla_lane_u16, lane 3 of a <4 x i16> vector.
// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmla_lane_u16(a, b, v, 3);
}

// vmlaq_lane_u16, lane 3: the shuffle widens the <4 x i16> %v to an 8-element
// splat.
// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlaq_lane_u16(a, b, v, 3);
}

// vmla_lane_u32, lane 1 of a <2 x i32> vector.
// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmla_lane_u32(a, b, v, 1);
}

// vmlaq_lane_u32, lane 1: the shuffle widens the <2 x i32> %v to a 4-element
// splat.
// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlaq_lane_u32(a, b, v, 1);
}

// vmla_laneq_u16, lane 7: the shuffle narrows the <8 x i16> %v to a 4-element
// splat.
// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[ADD]]
uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmla_laneq_u16(a, b, v, 7);
}

// vmlaq_laneq_u16, lane 7 of an <8 x i16> vector.
// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[ADD]]
uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlaq_laneq_u16(a, b, v, 7);
}

// vmla_laneq_u32, lane 3: the shuffle narrows the <4 x i32> %v to a 2-element
// splat.
// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[ADD]]
uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmla_laneq_u32(a, b, v, 3);
}

// vmlaq_laneq_u32, lane 3 of a <4 x i32> vector.
// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[ADD]]
uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlaq_laneq_u32(a, b, v, 3);
}

// Widening multiply-accumulate by element (vqdmlal). The expected IR is a
// lane splat of %v, a call to @llvm.aarch64.neon.sqdmull.* on %b and the
// splat, and a call to @llvm.aarch64.neon.sqadd.* that combines the product
// with the accumulator %a.

// vqdmlal_laneq_s16, lane 7: <4 x i16> operands, <4 x i32> result.
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlal_laneq_s16(a, b, v, 7);
}

// vqdmlal_laneq_s32, lane 3: <2 x i32> operands, <2 x i64> result.
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlal_laneq_s32(a, b, v, 3);
}

// vqdmlal_high_laneq_s16, lane 7: an extra leading shuffle selects elements
// 4-7 of %b before the lane splat of %v; the rest matches vqdmlal_laneq_s16.
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlal_high_laneq_s16(a, b, v, 7);
}

// vqdmlal_high_laneq_s32, lane 3: an extra leading shuffle selects elements
// 2-3 of %b before the lane splat of %v; the rest matches vqdmlal_laneq_s32.
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlal_high_laneq_s32(a, b, v, 3);
}

// Unsigned multiply-subtract by element. Same three-step shape as the vmla
// tests above (lane splat, plain IR mul), but the product is subtracted from
// %a with a plain IR sub.

// vmls_lane_u16, lane 3 of a <4 x i16> vector.
// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmls_lane_u16(a, b, v, 3);
}

// vmlsq_lane_u16, lane 3: the shuffle widens the <4 x i16> %v to an 8-element
// splat.
// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlsq_lane_u16(a, b, v, 3);
}

// vmls_lane_u32, lane 1 of a <2 x i32> vector.
// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmls_lane_u32(a, b, v, 1);
}

// vmlsq_lane_u32, lane 1: the shuffle widens the <2 x i32> %v to a 4-element
// splat.
// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlsq_lane_u32(a, b, v, 1);
}

// vmls_laneq_u16, lane 7: the shuffle narrows the <8 x i16> %v to a 4-element
// splat.
// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
// CHECK: ret <4 x i16> [[SUB]]
uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmls_laneq_u16(a, b, v, 7);
}

// vmlsq_laneq_u16, lane 7 of an <8 x i16> vector.
// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
// CHECK: ret <8 x i16> [[SUB]]
uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlsq_laneq_u16(a, b, v, 7);
}

// vmls_laneq_u32, lane 3: the shuffle narrows the <4 x i32> %v to a 2-element
// splat.
// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
// CHECK: ret <2 x i32> [[SUB]]
uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmls_laneq_u32(a, b, v, 3);
}

// vmlsq_laneq_u32, lane 3 of a <4 x i32> vector.
// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
// CHECK: ret <4 x i32> [[SUB]]
uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlsq_laneq_u32(a, b, v, 3);
}

// Widening multiply-subtract by element (vqdmlsl). The multiply stage is the
// same @llvm.aarch64.neon.sqdmull.* call as in the vqdmlal tests, and its
// pattern variables keep the VQDMLAL prefix; the final combine with %a is a
// call to @llvm.aarch64.neon.sqsub.* instead of sqadd.

// vqdmlsl_laneq_s16, lane 7: <4 x i16> operands, <4 x i32> result.
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlsl_laneq_s16(a, b, v, 7);
}

// vqdmlsl_laneq_s32, lane 3: <2 x i32> operands, <2 x i64> result.
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlsl_laneq_s32(a, b, v, 3);
}

// vqdmlsl_high_laneq_s16, lane 7: an extra leading shuffle selects elements
// 4-7 of %b before the lane splat of %v; the rest matches vqdmlsl_laneq_s16.
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlsl_high_laneq_s16(a, b, v, 7);
}

// vqdmlsl_high_laneq_s32, lane 3: an extra leading shuffle selects elements
// 2-3 of %b before the lane splat of %v; the rest matches vqdmlsl_laneq_s32.
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
// CHECK: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlsl_high_laneq_s32(a, b, v, 3);
}

// vqdmulh by element with a non-zero lane. Unlike the *_0 tests, the shuffle
// mask is spelled out as an explicit splat of the lane index (7 for
// <8 x i16> sources, 3 for <4 x i32> sources); the multiply is a call to
// @llvm.aarch64.neon.sqdmulh.*.

// vqdmulh_laneq_s16, lane 7.
// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqdmulh_laneq_s16(a, v, 7);
}

// vqdmulhq_laneq_s16, lane 7.
// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqdmulhq_laneq_s16(a, v, 7);
}

// vqdmulh_laneq_s32, lane 3.
// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqdmulh_laneq_s32(a, v, 3);
}

// vqdmulhq_laneq_s32, lane 3.
// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqdmulhq_laneq_s32(a, v, 3);
}

// vqrdmulh by element with a non-zero lane: same shape as the vqdmulh tests
// directly above, with @llvm.aarch64.neon.sqrdmulh.* as the expected call.

// vqrdmulh_laneq_s16, lane 7.
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
// CHECK: ret <4 x i16> [[TMP2]]
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqrdmulh_laneq_s16(a, v, 7);
}

// vqrdmulhq_laneq_s16, lane 7.
// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
// CHECK: ret <8 x i16> [[TMP2]]
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqrdmulhq_laneq_s16(a, v, 7);
}

// vqrdmulh_laneq_s32, lane 3.
// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
// CHECK: ret <2 x i32> [[TMP2]]
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqrdmulh_laneq_s32(a, v, 3);
}

// vqrdmulhq_laneq_s32, lane 3.
// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
// CHECK: ret <4 x i32> [[TMP2]]
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqrdmulhq_laneq_s32(a, v, 3);
}
