Home | History | Annotate | Download | only in CodeGen
      1 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
      2 
      3 // Test new aarch64 intrinsics and types
      4 
      5 #include <arm_neon.h>
      6 
      7 // CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
      8 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
      9 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
     10 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
     11 // CHECK:   ret <4 x i16> [[ADD]]
     12 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
     13   return vmla_lane_s16(a, b, v, 3);
     14 }
     15 
     16 // CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
     17 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
     18 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
     19 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
     20 // CHECK:   ret <8 x i16> [[ADD]]
     21 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
     22   return vmlaq_lane_s16(a, b, v, 3);
     23 }
     24 
     25 // CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
     26 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
     27 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
     28 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
     29 // CHECK:   ret <2 x i32> [[ADD]]
     30 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
     31   return vmla_lane_s32(a, b, v, 1);
     32 }
     33 
     34 // CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
     35 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
     36 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
     37 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
     38 // CHECK:   ret <4 x i32> [[ADD]]
     39 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
     40   return vmlaq_lane_s32(a, b, v, 1);
     41 }
     42 
     43 // CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
     44 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
     45 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
     46 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
     47 // CHECK:   ret <4 x i16> [[ADD]]
     48 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
     49   return vmla_laneq_s16(a, b, v, 7);
     50 }
     51 
     52 // CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
     53 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
     54 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
     55 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
     56 // CHECK:   ret <8 x i16> [[ADD]]
     57 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
     58   return vmlaq_laneq_s16(a, b, v, 7);
     59 }
     60 
     61 // CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
     62 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
     63 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
     64 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
     65 // CHECK:   ret <2 x i32> [[ADD]]
     66 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
     67   return vmla_laneq_s32(a, b, v, 3);
     68 }
     69 
     70 // CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
     71 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
     72 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
     73 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
     74 // CHECK:   ret <4 x i32> [[ADD]]
     75 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
     76   return vmlaq_laneq_s32(a, b, v, 3);
     77 }
     78 
     79 // CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
     80 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
     81 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
     82 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
     83 // CHECK:   ret <4 x i16> [[SUB]]
     84 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
     85   return vmls_lane_s16(a, b, v, 3);
     86 }
     87 
     88 // CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
     89 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
     90 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
     91 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
     92 // CHECK:   ret <8 x i16> [[SUB]]
     93 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
     94   return vmlsq_lane_s16(a, b, v, 3);
     95 }
     96 
     97 // CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
     98 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
     99 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
    100 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
    101 // CHECK:   ret <2 x i32> [[SUB]]
    102 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
    103   return vmls_lane_s32(a, b, v, 1);
    104 }
    105 
    106 // CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
    107 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    108 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
    109 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
    110 // CHECK:   ret <4 x i32> [[SUB]]
    111 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
    112   return vmlsq_lane_s32(a, b, v, 1);
    113 }
    114 
    115 // CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
    116 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    117 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
    118 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
    119 // CHECK:   ret <4 x i16> [[SUB]]
    120 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
    121   return vmls_laneq_s16(a, b, v, 7);
    122 }
    123 
    124 // CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
    125 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
    126 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
    127 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
    128 // CHECK:   ret <8 x i16> [[SUB]]
    129 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
    130   return vmlsq_laneq_s16(a, b, v, 7);
    131 }
    132 
    133 // CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
    134 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    135 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
    136 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
    137 // CHECK:   ret <2 x i32> [[SUB]]
    138 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
    139   return vmls_laneq_s32(a, b, v, 3);
    140 }
    141 
    142 // CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
    143 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    144 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
    145 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
    146 // CHECK:   ret <4 x i32> [[SUB]]
    147 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
    148   return vmlsq_laneq_s32(a, b, v, 3);
    149 }
    150 
    151 // CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
    152 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    153 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
    154 // CHECK:   ret <4 x i16> [[MUL]]
    155 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
    156   return vmul_lane_s16(a, v, 3);
    157 }
    158 
    159 // CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
    160 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
    161 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
    162 // CHECK:   ret <8 x i16> [[MUL]]
    163 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
    164   return vmulq_lane_s16(a, v, 3);
    165 }
    166 
    167 // CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
    168 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    169 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
    170 // CHECK:   ret <2 x i32> [[MUL]]
    171 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
    172   return vmul_lane_s32(a, v, 1);
    173 }
    174 
    175 // CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
    176 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    177 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
    178 // CHECK:   ret <4 x i32> [[MUL]]
    179 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
    180   return vmulq_lane_s32(a, v, 1);
    181 }
    182 
    183 // CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
    184 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    185 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
    186 // CHECK:   ret <4 x i16> [[MUL]]
    187 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
    188   return vmul_lane_u16(a, v, 3);
    189 }
    190 
    191 // CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
    192 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
    193 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
    194 // CHECK:   ret <8 x i16> [[MUL]]
    195 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
    196   return vmulq_lane_u16(a, v, 3);
    197 }
    198 
    199 // CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
    200 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    201 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
    202 // CHECK:   ret <2 x i32> [[MUL]]
    203 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
    204   return vmul_lane_u32(a, v, 1);
    205 }
    206 
    207 // CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
    208 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    209 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
    210 // CHECK:   ret <4 x i32> [[MUL]]
    211 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
    212   return vmulq_lane_u32(a, v, 1);
    213 }
    214 
    215 // CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
    216 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    217 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
    218 // CHECK:   ret <4 x i16> [[MUL]]
    219 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
    220   return vmul_laneq_s16(a, v, 7);
    221 }
    222 
    223 // CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
    224 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
    225 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
    226 // CHECK:   ret <8 x i16> [[MUL]]
    227 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
    228   return vmulq_laneq_s16(a, v, 7);
    229 }
    230 
    231 // CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
    232 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    233 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
    234 // CHECK:   ret <2 x i32> [[MUL]]
    235 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
    236   return vmul_laneq_s32(a, v, 3);
    237 }
    238 
    239 // CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
    240 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    241 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
    242 // CHECK:   ret <4 x i32> [[MUL]]
    243 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
    244   return vmulq_laneq_s32(a, v, 3);
    245 }
    246 
    247 // CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
    248 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    249 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
    250 // CHECK:   ret <4 x i16> [[MUL]]
    251 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
    252   return vmul_laneq_u16(a, v, 7);
    253 }
    254 
    255 // CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
    256 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
    257 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
    258 // CHECK:   ret <8 x i16> [[MUL]]
    259 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
    260   return vmulq_laneq_u16(a, v, 7);
    261 }
    262 
    263 // CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
    264 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    265 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
    266 // CHECK:   ret <2 x i32> [[MUL]]
    267 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
    268   return vmul_laneq_u32(a, v, 3);
    269 }
    270 
    271 // CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
    272 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    273 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
    274 // CHECK:   ret <4 x i32> [[MUL]]
    275 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
    276   return vmulq_laneq_u32(a, v, 3);
    277 }
    278 
    279 // CHECK-LABEL: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
    280 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
    281 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
    282 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
    283 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
    284 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
    285 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
    286 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
    287 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
    288 // CHECK:   ret <2 x float> [[FMLA2]]
    289 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
    290   return vfma_lane_f32(a, b, v, 1);
    291 }
    292 
    293 // CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
    294 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
    295 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
    296 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
    297 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
    298 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    299 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
    300 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
    301 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
    302 // CHECK:   ret <4 x float> [[FMLA2]]
    303 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
    304   return vfmaq_lane_f32(a, b, v, 1);
    305 }
    306 
    307 // CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
    308 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
    309 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
    310 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
    311 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
    312 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
    313 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
    314 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
    315 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
    316 // CHECK:   ret <2 x float> [[TMP6]]
    317 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
    318   return vfma_laneq_f32(a, b, v, 3);
    319 }
    320 
    321 // CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
    322 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
    323 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
    324 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
    325 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
    326 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
    327 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
    328 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    329 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
    330 // CHECK:   ret <4 x float> [[TMP6]]
    331 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
    332   return vfmaq_laneq_f32(a, b, v, 3);
    333 }
    334 
    335 // CHECK-LABEL: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
    336 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
    337 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
    338 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
    339 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
    340 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
    341 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
    342 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
    343 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
    344 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
    345 // CHECK:   ret <2 x float> [[FMLA2]]
    346 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
    347   return vfms_lane_f32(a, b, v, 1);
    348 }
    349 
    350 // CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
    351 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
    352 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
    353 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
    354 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
    355 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
    356 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    357 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
    358 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
    359 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
    360 // CHECK:   ret <4 x float> [[FMLA2]]
    361 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
    362   return vfmsq_lane_f32(a, b, v, 1);
    363 }
    364 
    365 // CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
    366 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
    367 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
    368 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
    369 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
    370 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
    371 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
    372 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
    373 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
    374 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
    375 // CHECK:   ret <2 x float> [[TMP6]]
    376 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
    377   return vfms_laneq_f32(a, b, v, 3);
    378 }
    379 
    380 // CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
    381 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
    382 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
    383 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
    384 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
    385 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
    386 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
    387 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
    388 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    389 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
    390 // CHECK:   ret <4 x float> [[TMP6]]
    391 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
    392   return vfmsq_laneq_f32(a, b, v, 3);
    393 }
    394 
    395 // CHECK-LABEL: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
    396 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
    397 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
    398 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
    399 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
    400 // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
    401 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
    402 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
    403 // CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
    404 // CHECK:   ret <2 x double> [[FMLA2]]
    405 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
    406   return vfmaq_lane_f64(a, b, v, 0);
    407 }
    408 
    409 // CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
    410 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
    411 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
    412 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
    413 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
    414 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
    415 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
    416 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
    417 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
    418 // CHECK:   ret <2 x double> [[TMP6]]
    419 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
    420   return vfmaq_laneq_f64(a, b, v, 1);
    421 }
    422 
    423 // CHECK-LABEL: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
    424 // CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
    425 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
    426 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
    427 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
    428 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
    429 // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
    430 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
    431 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
    432 // CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
    433 // CHECK:   ret <2 x double> [[FMLA2]]
    434 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
    435   return vfmsq_lane_f64(a, b, v, 0);
    436 }
    437 
    438 // CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
    439 // CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
    440 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
    441 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
    442 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
    443 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
    444 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
    445 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
    446 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
    447 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
    448 // CHECK:   ret <2 x double> [[TMP6]]
    449 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
    450   return vfmsq_laneq_f64(a, b, v, 1);
    451 }
    452 
    453 // CHECK-LABEL: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
    454 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
    455 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
    456 // CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
    457 // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
    458 // CHECK:   ret float [[TMP2]]
    459 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
    460   return vfmas_laneq_f32(a, b, v, 3);
    461 }
    462 
    463 // CHECK-LABEL: define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) #0 {
    464 // CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
    465 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8>
    466 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
    467 // CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
    468 // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
    469 // CHECK:   ret double [[TMP2]]
    470 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
    471   return vfmsd_lane_f64(a, b, v, 0);
    472 }
    473 
    474 // CHECK-LABEL: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
    475 // CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
    476 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
    477 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
    478 // CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
    479 // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
    480 // CHECK:   ret float [[TMP2]]
    481 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
    482   return vfmss_laneq_f32(a, b, v, 3);
    483 }
    484 
    485 // CHECK-LABEL: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) #0 {
    486 // CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
    487 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8>
    488 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
    489 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
    490 // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
    491 // CHECK:   ret double [[TMP2]]
    492 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
    493   return vfmsd_laneq_f64(a, b, v, 1);
    494 }
    495 
    496 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
    497 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    498 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
    499 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    500 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    501 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    502 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    503 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
    504 // CHECK:   ret <4 x i32> [[ADD]]
    505 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
    506   return vmlal_lane_s16(a, b, v, 3);
    507 }
    508 
    509 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
    510 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    511 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
    512 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    513 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    514 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    515 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    516 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
    517 // CHECK:   ret <2 x i64> [[ADD]]
    518 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
    519   return vmlal_lane_s32(a, b, v, 1);
    520 }
    521 
    522 // CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
    523 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    524 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
    525 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    526 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    527 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    528 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    529 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
    530 // CHECK:   ret <4 x i32> [[ADD]]
    531 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
    532   return vmlal_laneq_s16(a, b, v, 7);
    533 }
    534 
    535 // CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
    536 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    537 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
    538 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    539 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    540 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    541 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    542 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
    543 // CHECK:   ret <2 x i64> [[ADD]]
    544 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
    545   return vmlal_laneq_s32(a, b, v, 3);
    546 }
    547 
    548 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
    549 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    550 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    551 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
    552 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    553 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    554 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    555 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    556 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
    557 // CHECK:   ret <4 x i32> [[ADD]]
    558 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
    559   return vmlal_high_lane_s16(a, b, v, 3);
    560 }
    561 
    562 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
    563 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
    564 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    565 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
    566 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    567 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    568 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    569 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    570 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
    571 // CHECK:   ret <2 x i64> [[ADD]]
    572 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
    573   return vmlal_high_lane_s32(a, b, v, 1);
    574 }
    575 
    576 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
    577 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    578 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    579 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
    580 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    581 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    582 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    583 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    584 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
    585 // CHECK:   ret <4 x i32> [[ADD]]
    586 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
    587   return vmlal_high_laneq_s16(a, b, v, 7);
    588 }
    589 
    590 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
    591 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
    592 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    593 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
    594 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    595 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    596 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    597 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    598 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
    599 // CHECK:   ret <2 x i64> [[ADD]]
    600 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
    601   return vmlal_high_laneq_s32(a, b, v, 3);
    602 }
    603 
    604 // CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
    605 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    606 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
    607 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    608 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    609 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    610 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    611 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
    612 // CHECK:   ret <4 x i32> [[SUB]]
    613 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
    614   return vmlsl_lane_s16(a, b, v, 3);
    615 }
    616 
    617 // CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
    618 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    619 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
    620 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    621 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    622 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    623 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    624 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
    625 // CHECK:   ret <2 x i64> [[SUB]]
    626 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
    627   return vmlsl_lane_s32(a, b, v, 1);
    628 }
    629 
    630 // CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
    631 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    632 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
    633 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    634 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    635 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    636 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    637 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
    638 // CHECK:   ret <4 x i32> [[SUB]]
    639 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
    640   return vmlsl_laneq_s16(a, b, v, 7);
    641 }
    642 
    643 // CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
    644 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    645 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
    646 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    647 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    648 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    649 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    650 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
    651 // CHECK:   ret <2 x i64> [[SUB]]
    652 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
    653   return vmlsl_laneq_s32(a, b, v, 3);
    654 }
    655 
    656 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
    657 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    658 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    659 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
    660 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    661 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    662 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    663 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    664 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
    665 // CHECK:   ret <4 x i32> [[SUB]]
    666 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
    667   return vmlsl_high_lane_s16(a, b, v, 3);
    668 }
    669 
    670 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
    671 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
    672 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    673 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
    674 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    675 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    676 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    677 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    678 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
    679 // CHECK:   ret <2 x i64> [[SUB]]
    680 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
    681   return vmlsl_high_lane_s32(a, b, v, 1);
    682 }
    683 
    684 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
    685 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    686 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    687 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
    688 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    689 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    690 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    691 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    692 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
    693 // CHECK:   ret <4 x i32> [[SUB]]
    694 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
    695   return vmlsl_high_laneq_s16(a, b, v, 7);
    696 }
    697 
    698 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
    699 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
    700 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    701 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
    702 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    703 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    704 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    705 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    706 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
    707 // CHECK:   ret <2 x i64> [[SUB]]
    708 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
    709   return vmlsl_high_laneq_s32(a, b, v, 3);
    710 }
    711 
    712 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
    713 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    714 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
    715 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    716 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    717 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    718 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    719 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
    720 // CHECK:   ret <4 x i32> [[ADD]]
    721 int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
    722   return vmlal_lane_u16(a, b, v, 3);
    723 }
    724 
    725 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
    726 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    727 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
    728 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    729 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    730 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    731 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    732 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
    733 // CHECK:   ret <2 x i64> [[ADD]]
    734 int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
    735   return vmlal_lane_u32(a, b, v, 1);
    736 }
    737 
    738 // CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
    739 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    740 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
    741 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    742 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    743 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    744 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    745 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
    746 // CHECK:   ret <4 x i32> [[ADD]]
    747 int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
    748   return vmlal_laneq_u16(a, b, v, 7);
    749 }
    750 
    751 // CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
    752 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    753 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
    754 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    755 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    756 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    757 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    758 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
    759 // CHECK:   ret <2 x i64> [[ADD]]
    760 int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
    761   return vmlal_laneq_u32(a, b, v, 3);
    762 }
    763 
    764 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
    765 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    766 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    767 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
    768 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    769 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    770 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    771 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    772 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
    773 // CHECK:   ret <4 x i32> [[ADD]]
    774 int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
    775   return vmlal_high_lane_u16(a, b, v, 3);
    776 }
    777 
    778 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
    779 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
    780 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    781 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
    782 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    783 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    784 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    785 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    786 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
    787 // CHECK:   ret <2 x i64> [[ADD]]
    788 int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
    789   return vmlal_high_lane_u32(a, b, v, 1);
    790 }
    791 
    792 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
    793 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    794 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    795 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
    796 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    797 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    798 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    799 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    800 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
    801 // CHECK:   ret <4 x i32> [[ADD]]
    802 int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
    803   return vmlal_high_laneq_u16(a, b, v, 7);
    804 }
    805 
    806 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
    807 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
    808 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    809 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
    810 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    811 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    812 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    813 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    814 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
    815 // CHECK:   ret <2 x i64> [[ADD]]
    816 int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
    817   return vmlal_high_laneq_u32(a, b, v, 3);
    818 }
    819 
    820 // CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
    821 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    822 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
    823 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    824 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    825 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    826 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    827 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
    828 // CHECK:   ret <4 x i32> [[SUB]]
    829 int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
    830   return vmlsl_lane_u16(a, b, v, 3);
    831 }
    832 
    833 // CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
    834 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    835 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
    836 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    837 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    838 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    839 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    840 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
    841 // CHECK:   ret <2 x i64> [[SUB]]
    842 int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
    843   return vmlsl_lane_u32(a, b, v, 1);
    844 }
    845 
    846 // CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
    847 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    848 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
    849 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    850 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    851 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    852 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    853 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
    854 // CHECK:   ret <4 x i32> [[SUB]]
    855 int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
    856   return vmlsl_laneq_u16(a, b, v, 7);
    857 }
    858 
    859 // CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
    860 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    861 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
    862 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    863 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    864 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    865 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    866 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
    867 // CHECK:   ret <2 x i64> [[SUB]]
    868 int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
    869   return vmlsl_laneq_u32(a, b, v, 3);
    870 }
    871 
    872 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
    873 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    874 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    875 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
    876 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    877 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    878 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    879 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    880 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
    881 // CHECK:   ret <4 x i32> [[SUB]]
    882 int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
    883   return vmlsl_high_lane_u16(a, b, v, 3);
    884 }
    885 
    886 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
    887 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
    888 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    889 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
    890 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    891 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    892 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    893 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    894 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
    895 // CHECK:   ret <2 x i64> [[SUB]]
    896 int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
    897   return vmlsl_high_lane_u32(a, b, v, 1);
    898 }
    899 
    900 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
    901 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    902 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    903 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
    904 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    905 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    906 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    907 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    908 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
    909 // CHECK:   ret <4 x i32> [[SUB]]
    910 int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
    911   return vmlsl_high_laneq_u16(a, b, v, 7);
    912 }
    913 
    914 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
    915 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
    916 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
    917 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
    918 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    919 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    920 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    921 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    922 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
    923 // CHECK:   ret <2 x i64> [[SUB]]
    924 int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
    925   return vmlsl_high_laneq_u32(a, b, v, 3);
    926 }
    927 
    928 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
    929 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    930 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
    931 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    932 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    933 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    934 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    935 // CHECK:   ret <4 x i32> [[VMULL2_I]]
    936 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
    937   return vmull_lane_s16(a, v, 3);
    938 }
    939 
    940 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
    941 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    942 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
    943 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    944 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    945 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    946 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    947 // CHECK:   ret <2 x i64> [[VMULL2_I]]
    948 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
    949   return vmull_lane_s32(a, v, 1);
    950 }
    951 
    952 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
    953 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    954 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
    955 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    956 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    957 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    958 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    959 // CHECK:   ret <4 x i32> [[VMULL2_I]]
    960 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
    961   return vmull_lane_u16(a, v, 3);
    962 }
    963 
    964 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
    965 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    966 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
    967 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    968 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    969 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    970 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    971 // CHECK:   ret <2 x i64> [[VMULL2_I]]
    972 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
    973   return vmull_lane_u32(a, v, 1);
    974 }
    975 
    976 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
    977 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    978 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    979 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
    980 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
    981 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    982 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    983 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
    984 // CHECK:   ret <4 x i32> [[VMULL2_I]]
    985 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
    986   return vmull_high_lane_s16(a, v, 3);
    987 }
    988 
    989 // CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
    990 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
    991 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
    992 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
    993 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
    994 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    995 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    996 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
    997 // CHECK:   ret <2 x i64> [[VMULL2_I]]
    998 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
    999   return vmull_high_lane_s32(a, v, 1);
   1000 }
   1001 
   1002 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
   1003 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1004 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1005 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   1006 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1007 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1008 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1009 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   1010 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   1011 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
   1012   return vmull_high_lane_u16(a, v, 3);
   1013 }
   1014 
   1015 // CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
   1016 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   1017 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   1018 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   1019 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1020 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1021 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1022 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   1023 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   1024 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
   1025   return vmull_high_lane_u32(a, v, 1);
   1026 }
   1027 
   1028 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
   1029 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   1030 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1031 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1032 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1033 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1034 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   1035 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   1036 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
   1037   return vmull_laneq_s16(a, v, 7);
   1038 }
   1039 
   1040 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
   1041 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   1042 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1043 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1044 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1045 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1046 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   1047 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   1048 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
   1049   return vmull_laneq_s32(a, v, 3);
   1050 }
   1051 
   1052 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
   1053 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   1054 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1055 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1056 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1057 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1058 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   1059 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   1060 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
   1061   return vmull_laneq_u16(a, v, 7);
   1062 }
   1063 
   1064 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
   1065 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   1066 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1067 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1068 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1069 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1070 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   1071 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   1072 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
   1073   return vmull_laneq_u32(a, v, 3);
   1074 }
   1075 
   1076 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
   1077 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1078 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   1079 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   1080 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1081 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1082 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1083 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   1084 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   1085 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
   1086   return vmull_high_laneq_s16(a, v, 7);
   1087 }
   1088 
   1089 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
   1090 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   1091 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   1092 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   1093 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1094 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1095 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1096 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   1097 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   1098 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
   1099   return vmull_high_laneq_s32(a, v, 3);
   1100 }
   1101 
   1102 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
   1103 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1104 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   1105 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   1106 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1107 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1108 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1109 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   1110 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   1111 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
   1112   return vmull_high_laneq_u16(a, v, 7);
   1113 }
   1114 
   1115 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
   1116 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   1117 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   1118 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   1119 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1120 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1121 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1122 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   1123 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   1124 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
   1125   return vmull_high_laneq_u32(a, v, 3);
   1126 }
   1127 
   1128 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   1129 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1130 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1131 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   1132 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1133 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1134 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   1135 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   1136 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1137 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   1138 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
   1139 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
   1140   return vqdmlal_lane_s16(a, b, v, 3);
   1141 }
   1142 
   1143 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   1144 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   1145 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   1146 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   1147 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1148 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1149 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   1150 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   1151 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   1152 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   1153 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
   1154 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
   1155   return vqdmlal_lane_s32(a, b, v, 1);
   1156 }
   1157 
   1158 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   1159 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1160 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1161 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1162 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   1163 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1164 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1165 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   1166 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   1167 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1168 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   1169 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
   1170 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
   1171   return vqdmlal_high_lane_s16(a, b, v, 3);
   1172 }
   1173 
   1174 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   1175 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   1176 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   1177 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   1178 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   1179 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1180 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1181 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   1182 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   1183 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   1184 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   1185 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
   1186 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
   1187   return vqdmlal_high_lane_s32(a, b, v, 1);
   1188 }
   1189 
   1190 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   1191 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1192 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1193 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   1194 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1195 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1196 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   1197 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   1198 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1199 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   1200 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
   1201 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
   1202   return vqdmlsl_lane_s16(a, b, v, 3);
   1203 }
   1204 
   1205 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   1206 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   1207 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   1208 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   1209 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1210 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1211 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   1212 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   1213 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   1214 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   1215 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
   1216 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
   1217   return vqdmlsl_lane_s32(a, b, v, 1);
   1218 }
   1219 
   1220 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   1221 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1222 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1223 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1224 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   1225 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1226 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1227 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   1228 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   1229 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1230 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   1231 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
   1232 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
   1233   return vqdmlsl_high_lane_s16(a, b, v, 3);
   1234 }
   1235 
   1236 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   1237 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   1238 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   1239 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   1240 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   1241 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1242 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1243 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   1244 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   1245 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   1246 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   1247 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
   1248 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
   1249   return vqdmlsl_high_lane_s32(a, b, v, 1);
   1250 }
   1251 
   1252 // CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
   1253 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1254 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1255 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1256 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1257 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1258 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
   1259 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
   1260 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
   1261 // CHECK:   ret <4 x i32> [[TMP2]]
   1262 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
   1263   return vqdmull_lane_s16(a, v, 3);
   1264 }
   1265 
   1266 // CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
   1267 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   1268 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1269 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1270 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1271 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1272 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
   1273 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
   1274 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
   1275 // CHECK:   ret <2 x i64> [[TMP2]]
   1276 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
   1277   return vqdmull_lane_s32(a, v, 1);
   1278 }
   1279 
   1280 // CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
   1281 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1282 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1283 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1284 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1285 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1286 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
   1287 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
   1288 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
   1289 // CHECK:   ret <4 x i32> [[TMP2]]
   1290 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
   1291   return vqdmull_laneq_s16(a, v, 3);
   1292 }
   1293 
   1294 // CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
   1295 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   1296 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1297 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1298 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1299 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1300 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
   1301 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
   1302 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
   1303 // CHECK:   ret <2 x i64> [[TMP2]]
   1304 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
   1305   return vqdmull_laneq_s32(a, v, 3);
   1306 }
   1307 
   1308 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
   1309 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1310 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1311 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   1312 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1313 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1314 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1315 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
   1316 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
   1317 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
   1318 // CHECK:   ret <4 x i32> [[TMP2]]
   1319 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
   1320   return vqdmull_high_lane_s16(a, v, 3);
   1321 }
   1322 
   1323 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
   1324 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   1325 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   1326 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   1327 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1328 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1329 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1330 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
   1331 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
   1332 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
   1333 // CHECK:   ret <2 x i64> [[TMP2]]
   1334 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
   1335   return vqdmull_high_lane_s32(a, v, 1);
   1336 }
   1337 
   1338 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
   1339 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1340 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   1341 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   1342 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1343 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1344 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1345 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
   1346 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
   1347 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
   1348 // CHECK:   ret <4 x i32> [[TMP2]]
   1349 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
   1350   return vqdmull_high_laneq_s16(a, v, 7);
   1351 }
   1352 
   1353 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
   1354 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   1355 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   1356 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   1357 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1358 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1359 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1360 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
   1361 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
   1362 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
   1363 // CHECK:   ret <2 x i64> [[TMP2]]
   1364 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
   1365   return vqdmull_high_laneq_s32(a, v, 3);
   1366 }
   1367 
   1368 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
   1369 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1370 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1371 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1372 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1373 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1374 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
   1375 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
   1376 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
   1377 // CHECK:   ret <4 x i16> [[TMP2]]
   1378 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   1379   return vqdmulh_lane_s16(a, v, 3);
   1380 }
   1381 
   1382 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
   1383 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   1384 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   1385 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
   1386 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   1387 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   1388 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
   1389 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
   1390 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
   1391 // CHECK:   ret <8 x i16> [[TMP2]]
   1392 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   1393   return vqdmulhq_lane_s16(a, v, 3);
   1394 }
   1395 
   1396 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
   1397 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   1398 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1399 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1400 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1401 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1402 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
   1403 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
   1404 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
   1405 // CHECK:   ret <2 x i32> [[TMP2]]
   1406 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   1407   return vqdmulh_lane_s32(a, v, 1);
   1408 }
   1409 
   1410 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
   1411 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1412 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1413 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
   1414 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1415 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   1416 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
   1417 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
   1418 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
   1419 // CHECK:   ret <4 x i32> [[TMP2]]
   1420 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   1421   return vqdmulhq_lane_s32(a, v, 1);
   1422 }
   1423 
   1424 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
   1425 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1426 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   1427 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   1428 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1429 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1430 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
   1431 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
   1432 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
   1433 // CHECK:   ret <4 x i16> [[TMP2]]
   1434 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   1435   return vqrdmulh_lane_s16(a, v, 3);
   1436 }
   1437 
   1438 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
   1439 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   1440 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   1441 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
   1442 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   1443 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   1444 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
   1445 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
   1446 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
   1447 // CHECK:   ret <8 x i16> [[TMP2]]
   1448 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   1449   return vqrdmulhq_lane_s16(a, v, 3);
   1450 }
   1451 
   1452 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
   1453 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   1454 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   1455 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   1456 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1457 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1458 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
   1459 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
   1460 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
   1461 // CHECK:   ret <2 x i32> [[TMP2]]
   1462 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   1463   return vqrdmulh_lane_s32(a, v, 1);
   1464 }
   1465 
   1466 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
   1467 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1468 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   1469 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
   1470 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1471 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   1472 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
   1473 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
   1474 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
   1475 // CHECK:   ret <4 x i32> [[TMP2]]
   1476 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   1477   return vqrdmulhq_lane_s32(a, v, 1);
   1478 }
   1479 
   1480 // CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
   1481 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
   1482 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
   1483 // CHECK:   ret <2 x float> [[MUL]]
   1484 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
   1485   return vmul_lane_f32(a, v, 1);
   1486 }
   1487 
   1488 
   1489 // CHECK-LABEL: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) #0 {
   1490 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
   1491 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
   1492 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
   1493 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
   1494 // CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
   1495 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
   1496 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
   1497 // CHECK:   ret <1 x double> [[TMP5]]
   1498 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
   1499   return vmul_lane_f64(a, v, 0);
   1500 }
   1501 
   1502 
   1503 // CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
   1504 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1505 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
   1506 // CHECK:   ret <4 x float> [[MUL]]
   1507 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
   1508   return vmulq_lane_f32(a, v, 1);
   1509 }
   1510 
   1511 // CHECK-LABEL: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
   1512 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
   1513 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
   1514 // CHECK:   ret <2 x double> [[MUL]]
   1515 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
   1516   return vmulq_lane_f64(a, v, 0);
   1517 }
   1518 
   1519 // CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
   1520 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
   1521 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
   1522 // CHECK:   ret <2 x float> [[MUL]]
   1523 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
   1524   return vmul_laneq_f32(a, v, 3);
   1525 }
   1526 
   1527 // CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) #0 {
   1528 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
   1529 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
   1530 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
   1531 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1532 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
   1533 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
   1534 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
   1535 // CHECK:   ret <1 x double> [[TMP5]]
   1536 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
   1537   return vmul_laneq_f64(a, v, 1);
   1538 }
   1539 
   1540 
   1541 // CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
   1542 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1543 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
   1544 // CHECK:   ret <4 x float> [[MUL]]
   1545 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
   1546   return vmulq_laneq_f32(a, v, 3);
   1547 }
   1548 
   1549 // CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
   1550 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
   1551 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
   1552 // CHECK:   ret <2 x double> [[MUL]]
   1553 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
   1554   return vmulq_laneq_f64(a, v, 1);
   1555 }
   1556 
   1557 // CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
   1558 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
   1559 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1560 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
   1561 // CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1562 // CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1563 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
   1564 // CHECK:   ret <2 x float> [[VMULX2_I]]
   1565 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
   1566   return vmulx_lane_f32(a, v, 1);
   1567 }
   1568 
   1569 // CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
   1570 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1571 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   1572 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
   1573 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1574 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1575 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
   1576 // CHECK:   ret <4 x float> [[VMULX2_I]]
   1577 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
   1578   return vmulxq_lane_f32(a, v, 1);
   1579 }
   1580 
   1581 // CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
   1582 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
   1583 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   1584 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
   1585 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   1586 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1587 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
   1588 // CHECK:   ret <2 x double> [[VMULX2_I]]
   1589 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
   1590   return vmulxq_lane_f64(a, v, 0);
   1591 }
   1592 
   1593 // CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
   1594 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
   1595 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1596 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
   1597 // CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1598 // CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1599 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
   1600 // CHECK:   ret <2 x float> [[VMULX2_I]]
   1601 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
   1602   return vmulx_laneq_f32(a, v, 3);
   1603 }
   1604 
   1605 // CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
   1606 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1607 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   1608 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
   1609 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1610 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1611 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
   1612 // CHECK:   ret <4 x float> [[VMULX2_I]]
   1613 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
   1614   return vmulxq_laneq_f32(a, v, 3);
   1615 }
   1616 
   1617 // CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
   1618 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
   1619 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   1620 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
   1621 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   1622 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1623 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
   1624 // CHECK:   ret <2 x double> [[VMULX2_I]]
   1625 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
   1626   return vmulxq_laneq_f64(a, v, 1);
   1627 }
   1628 
   1629 // CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   1630 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   1631 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   1632 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
   1633 // CHECK:   ret <4 x i16> [[ADD]]
   1634 int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
   1635   return vmla_lane_s16(a, b, v, 0);
   1636 }
   1637 
   1638 // CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   1639 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
   1640 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   1641 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
   1642 // CHECK:   ret <8 x i16> [[ADD]]
   1643 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
   1644   return vmlaq_lane_s16(a, b, v, 0);
   1645 }
   1646 
   1647 // CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   1648 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   1649 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   1650 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
   1651 // CHECK:   ret <2 x i32> [[ADD]]
   1652 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
   1653   return vmla_lane_s32(a, b, v, 0);
   1654 }
   1655 
   1656 // CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   1657 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
   1658 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   1659 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
   1660 // CHECK:   ret <4 x i32> [[ADD]]
   1661 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
   1662   return vmlaq_lane_s32(a, b, v, 0);
   1663 }
   1664 
   1665 // CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   1666 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   1667 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   1668 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
   1669 // CHECK:   ret <4 x i16> [[ADD]]
   1670 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
   1671   return vmla_laneq_s16(a, b, v, 0);
   1672 }
   1673 
   1674 // CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   1675 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
   1676 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   1677 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
   1678 // CHECK:   ret <8 x i16> [[ADD]]
   1679 int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
   1680   return vmlaq_laneq_s16(a, b, v, 0);
   1681 }
   1682 
   1683 // CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   1684 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   1685 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   1686 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
   1687 // CHECK:   ret <2 x i32> [[ADD]]
   1688 int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
   1689   return vmla_laneq_s32(a, b, v, 0);
   1690 }
   1691 
   1692 // CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   1693 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
   1694 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   1695 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
   1696 // CHECK:   ret <4 x i32> [[ADD]]
   1697 int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
   1698   return vmlaq_laneq_s32(a, b, v, 0);
   1699 }
   1700 
   1701 // CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   1702 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   1703 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   1704 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
   1705 // CHECK:   ret <4 x i16> [[SUB]]
   1706 int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
   1707   return vmls_lane_s16(a, b, v, 0);
   1708 }
   1709 
   1710 // CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   1711 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
   1712 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   1713 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
   1714 // CHECK:   ret <8 x i16> [[SUB]]
   1715 int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
   1716   return vmlsq_lane_s16(a, b, v, 0);
   1717 }
   1718 
   1719 // CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   1720 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   1721 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   1722 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
   1723 // CHECK:   ret <2 x i32> [[SUB]]
   1724 int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
   1725   return vmls_lane_s32(a, b, v, 0);
   1726 }
   1727 
   1728 // CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   1729 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
   1730 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   1731 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
   1732 // CHECK:   ret <4 x i32> [[SUB]]
   1733 int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
   1734   return vmlsq_lane_s32(a, b, v, 0);
   1735 }
   1736 
   1737 // CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   1738 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   1739 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   1740 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
   1741 // CHECK:   ret <4 x i16> [[SUB]]
   1742 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
   1743   return vmls_laneq_s16(a, b, v, 0);
   1744 }
   1745 
   1746 // CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   1747 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
   1748 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   1749 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
   1750 // CHECK:   ret <8 x i16> [[SUB]]
   1751 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
   1752   return vmlsq_laneq_s16(a, b, v, 0);
   1753 }
   1754 
   1755 // CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   1756 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   1757 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   1758 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
   1759 // CHECK:   ret <2 x i32> [[SUB]]
   1760 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
   1761   return vmls_laneq_s32(a, b, v, 0);
   1762 }
   1763 
   1764 // CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   1765 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
   1766 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   1767 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
   1768 // CHECK:   ret <4 x i32> [[SUB]]
   1769 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
   1770   return vmlsq_laneq_s32(a, b, v, 0);
   1771 }
   1772 
   1773 // CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
   1774 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   1775 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
   1776 // CHECK:   ret <4 x i16> [[MUL]]
   1777 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
   1778   return vmul_lane_s16(a, v, 0);
   1779 }
   1780 
   1781 // CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
   1782 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
   1783 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
   1784 // CHECK:   ret <8 x i16> [[MUL]]
   1785 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
   1786   return vmulq_lane_s16(a, v, 0);
   1787 }
   1788 
   1789 // CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
   1790 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   1791 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
   1792 // CHECK:   ret <2 x i32> [[MUL]]
   1793 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
   1794   return vmul_lane_s32(a, v, 0);
   1795 }
   1796 
   1797 // CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
   1798 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
   1799 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
   1800 // CHECK:   ret <4 x i32> [[MUL]]
   1801 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
   1802   return vmulq_lane_s32(a, v, 0);
   1803 }
   1804 
   1805 // CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
   1806 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   1807 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
   1808 // CHECK:   ret <4 x i16> [[MUL]]
   1809 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
   1810   return vmul_lane_u16(a, v, 0);
   1811 }
   1812 
   1813 // CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
   1814 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
   1815 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
   1816 // CHECK:   ret <8 x i16> [[MUL]]
   1817 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
   1818   return vmulq_lane_u16(a, v, 0);
   1819 }
   1820 
   1821 // CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
   1822 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   1823 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
   1824 // CHECK:   ret <2 x i32> [[MUL]]
   1825 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
   1826   return vmul_lane_u32(a, v, 0);
   1827 }
   1828 
   1829 // CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
   1830 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
   1831 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
   1832 // CHECK:   ret <4 x i32> [[MUL]]
   1833 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
   1834   return vmulq_lane_u32(a, v, 0);
   1835 }
   1836 
   1837 // CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
   1838 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   1839 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
   1840 // CHECK:   ret <4 x i16> [[MUL]]
   1841 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
   1842   return vmul_laneq_s16(a, v, 0);
   1843 }
   1844 
   1845 // CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
   1846 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
   1847 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
   1848 // CHECK:   ret <8 x i16> [[MUL]]
   1849 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   1850   return vmulq_laneq_s16(a, v, 0);
   1851 }
   1852 
   1853 // CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
   1854 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   1855 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
   1856 // CHECK:   ret <2 x i32> [[MUL]]
   1857 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
   1858   return vmul_laneq_s32(a, v, 0);
   1859 }
   1860 
   1861 // CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
   1862 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
   1863 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
   1864 // CHECK:   ret <4 x i32> [[MUL]]
   1865 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   1866   return vmulq_laneq_s32(a, v, 0);
   1867 }
   1868 
   1869 // CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
   1870 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   1871 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
   1872 // CHECK:   ret <4 x i16> [[MUL]]
   1873 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
   1874   return vmul_laneq_u16(a, v, 0);
   1875 }
   1876 
   1877 // CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
   1878 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
   1879 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
   1880 // CHECK:   ret <8 x i16> [[MUL]]
   1881 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
   1882   return vmulq_laneq_u16(a, v, 0);
   1883 }
   1884 
   1885 // CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
   1886 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   1887 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
   1888 // CHECK:   ret <2 x i32> [[MUL]]
   1889 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
   1890   return vmul_laneq_u32(a, v, 0);
   1891 }
   1892 
   1893 // CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
   1894 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
   1895 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
   1896 // CHECK:   ret <4 x i32> [[MUL]]
   1897 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
   1898   return vmulq_laneq_u32(a, v, 0);
   1899 }
   1900 
   1901 // CHECK-LABEL: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
   1902 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1903 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   1904 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
   1905 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
   1906 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
   1907 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1908 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1909 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
   1910 // CHECK:   ret <2 x float> [[FMLA2]]
   1911 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
   1912   return vfma_lane_f32(a, b, v, 0);
   1913 }
   1914 
   1915 // CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
   1916 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   1917 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   1918 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
   1919 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
   1920 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
   1921 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1922 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1923 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
   1924 // CHECK:   ret <4 x float> [[FMLA2]]
   1925 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
   1926   return vfmaq_lane_f32(a, b, v, 0);
   1927 }
   1928 
   1929 // CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
   1930 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1931 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   1932 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
   1933 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1934 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1935 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
   1936 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
   1937 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
   1938 // CHECK:   ret <2 x float> [[TMP6]]
   1939 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
   1940   return vfma_laneq_f32(a, b, v, 0);
   1941 }
   1942 
   1943 // CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
   1944 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   1945 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   1946 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
   1947 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1948 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1949 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
   1950 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
   1951 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
   1952 // CHECK:   ret <4 x float> [[TMP6]]
   1953 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
   1954   return vfmaq_laneq_f32(a, b, v, 0);
   1955 }
   1956 
   1957 // CHECK-LABEL: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
   1958 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
   1959 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1960 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
   1961 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
   1962 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
   1963 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
   1964 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1965 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1966 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
   1967 // CHECK:   ret <2 x float> [[FMLA2]]
   1968 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
   1969   return vfms_lane_f32(a, b, v, 0);
   1970 }
   1971 
   1972 // CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
   1973 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
   1974 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   1975 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
   1976 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
   1977 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
   1978 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
   1979 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1980 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1981 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
   1982 // CHECK:   ret <4 x float> [[FMLA2]]
   1983 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
   1984   return vfmsq_lane_f32(a, b, v, 0);
   1985 }
   1986 
   1987 // CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
   1988 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
   1989 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   1990 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
   1991 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
   1992 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1993 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1994 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
   1995 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
   1996 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
   1997 // CHECK:   ret <2 x float> [[TMP6]]
   1998 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
   1999   return vfms_laneq_f32(a, b, v, 0);
   2000 }
   2001 
   2002 // CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
   2003 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
   2004 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   2005 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
   2006 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
   2007 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   2008 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   2009 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
   2010 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
   2011 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
   2012 // CHECK:   ret <4 x float> [[TMP6]]
   2013 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
   2014   return vfmsq_laneq_f32(a, b, v, 0);
   2015 }
   2016 
   2017 // CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
   2018 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   2019 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
   2020 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
   2021 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   2022 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   2023 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
   2024 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
   2025 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
   2026 // CHECK:   ret <2 x double> [[TMP6]]
   2027 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
   2028   return vfmaq_laneq_f64(a, b, v, 0);
   2029 }
   2030 
   2031 // CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
   2032 // CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
   2033 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   2034 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
   2035 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
   2036 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   2037 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   2038 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
   2039 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
   2040 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
   2041 // CHECK:   ret <2 x double> [[TMP6]]
   2042 float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
   2043   return vfmsq_laneq_f64(a, b, v, 0);
   2044 }
   2045 
   2046 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   2047 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2048 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2049 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2050 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2051 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2052 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2053 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   2054 // CHECK:   ret <4 x i32> [[ADD]]
   2055 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
   2056   return vmlal_lane_s16(a, b, v, 0);
   2057 }
   2058 
   2059 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   2060 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2061 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2062 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2063 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2064 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2065 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2066 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   2067 // CHECK:   ret <2 x i64> [[ADD]]
   2068 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
   2069   return vmlal_lane_s32(a, b, v, 0);
   2070 }
   2071 
   2072 // CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   2073 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2074 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2075 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2076 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2077 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2078 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2079 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   2080 // CHECK:   ret <4 x i32> [[ADD]]
   2081 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
   2082   return vmlal_laneq_s16(a, b, v, 0);
   2083 }
   2084 
   2085 // CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   2086 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2087 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2088 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2089 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2090 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2091 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2092 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   2093 // CHECK:   ret <2 x i64> [[ADD]]
   2094 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
   2095   return vmlal_laneq_s32(a, b, v, 0);
   2096 }
   2097 
   2098 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   2099 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2100 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2101 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2102 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2103 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2104 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2105 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2106 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   2107 // CHECK:   ret <4 x i32> [[ADD]]
   2108 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
   2109   return vmlal_high_lane_s16(a, b, v, 0);
   2110 }
   2111 
   2112 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   2113 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2114 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2115 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2116 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2117 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2118 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2119 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2120 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   2121 // CHECK:   ret <2 x i64> [[ADD]]
   2122 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
   2123   return vmlal_high_lane_s32(a, b, v, 0);
   2124 }
   2125 
   2126 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   2127 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2128 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2129 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2130 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2131 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2132 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2133 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2134 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   2135 // CHECK:   ret <4 x i32> [[ADD]]
   2136 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
   2137   return vmlal_high_laneq_s16(a, b, v, 0);
   2138 }
   2139 
   2140 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   2141 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2142 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2143 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2144 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2145 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2146 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2147 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2148 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   2149 // CHECK:   ret <2 x i64> [[ADD]]
   2150 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
   2151   return vmlal_high_laneq_s32(a, b, v, 0);
   2152 }
   2153 
   2154 // CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   2155 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2156 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2157 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2158 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2159 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2160 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2161 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
   2162 // CHECK:   ret <4 x i32> [[SUB]]
   2163 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
   2164   return vmlsl_lane_s16(a, b, v, 0);
   2165 }
   2166 
   2167 // CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   2168 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2169 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2170 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2171 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2172 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2173 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2174 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
   2175 // CHECK:   ret <2 x i64> [[SUB]]
   2176 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
   2177   return vmlsl_lane_s32(a, b, v, 0);
   2178 }
   2179 
   2180 // CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   2181 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2182 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2183 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2184 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2185 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2186 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2187 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
   2188 // CHECK:   ret <4 x i32> [[SUB]]
   2189 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
   2190   return vmlsl_laneq_s16(a, b, v, 0);
   2191 }
   2192 
   2193 // CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   2194 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2195 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2196 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2197 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2198 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2199 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2200 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
   2201 // CHECK:   ret <2 x i64> [[SUB]]
   2202 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
   2203   return vmlsl_laneq_s32(a, b, v, 0);
   2204 }
   2205 
   2206 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   2207 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2208 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2209 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2210 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2211 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2212 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2213 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2214 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
   2215 // CHECK:   ret <4 x i32> [[SUB]]
   2216 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
   2217   return vmlsl_high_lane_s16(a, b, v, 0);
   2218 }
   2219 
   2220 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   2221 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2222 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2223 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2224 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2225 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2226 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2227 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2228 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
   2229 // CHECK:   ret <2 x i64> [[SUB]]
   2230 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
   2231   return vmlsl_high_lane_s32(a, b, v, 0);
   2232 }
   2233 
   2234 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   2235 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2236 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2237 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2238 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2239 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2240 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2241 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2242 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
   2243 // CHECK:   ret <4 x i32> [[SUB]]
   2244 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
   2245   return vmlsl_high_laneq_s16(a, b, v, 0);
   2246 }
   2247 
   2248 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   2249 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2250 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2251 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2252 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2253 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2254 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2255 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2256 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
   2257 // CHECK:   ret <2 x i64> [[SUB]]
   2258 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
   2259   return vmlsl_high_laneq_s32(a, b, v, 0);
   2260 }
   2261 
   2262 // CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   2263 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2264 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2265 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2266 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2267 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2268 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2269 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   2270 // CHECK:   ret <4 x i32> [[ADD]]
   2271 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
   2272   return vmlal_lane_u16(a, b, v, 0);
   2273 }
   2274 
   2275 // CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   2276 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2277 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2278 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2279 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2280 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2281 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2282 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   2283 // CHECK:   ret <2 x i64> [[ADD]]
   2284 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
   2285   return vmlal_lane_u32(a, b, v, 0);
   2286 }
   2287 
   2288 // CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   2289 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2290 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2291 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2292 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2293 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2294 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2295 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   2296 // CHECK:   ret <4 x i32> [[ADD]]
   2297 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
   2298   return vmlal_laneq_u16(a, b, v, 0);
   2299 }
   2300 
   2301 // CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   2302 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2303 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2304 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2305 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2306 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2307 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2308 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   2309 // CHECK:   ret <2 x i64> [[ADD]]
   2310 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
   2311   return vmlal_laneq_u32(a, b, v, 0);
   2312 }
   2313 
   2314 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   2315 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2316 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2317 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2318 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2319 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2320 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2321 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2322 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   2323 // CHECK:   ret <4 x i32> [[ADD]]
   2324 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
   2325   return vmlal_high_lane_u16(a, b, v, 0);
   2326 }
   2327 
   2328 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   2329 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2330 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2331 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2332 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2333 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2334 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2335 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2336 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   2337 // CHECK:   ret <2 x i64> [[ADD]]
   2338 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
   2339   return vmlal_high_lane_u32(a, b, v, 0);
   2340 }
   2341 
   2342 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   2343 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2344 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2345 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2346 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2347 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2348 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2349 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2350 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
   2351 // CHECK:   ret <4 x i32> [[ADD]]
   2352 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
   2353   return vmlal_high_laneq_u16(a, b, v, 0);
   2354 }
   2355 
   2356 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   2357 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2358 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2359 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2360 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2361 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2362 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2363 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2364 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
   2365 // CHECK:   ret <2 x i64> [[ADD]]
   2366 int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
   2367   return vmlal_high_laneq_u32(a, b, v, 0);
   2368 }
   2369 
   2370 // CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   2371 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2372 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2373 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2374 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2375 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2376 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2377 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
   2378 // CHECK:   ret <4 x i32> [[SUB]]
   2379 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
   2380   return vmlsl_lane_u16(a, b, v, 0);
   2381 }
   2382 
   2383 // CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   2384 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2385 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2386 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2387 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2388 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2389 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2390 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
   2391 // CHECK:   ret <2 x i64> [[SUB]]
   2392 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
   2393   return vmlsl_lane_u32(a, b, v, 0);
   2394 }
   2395 
   2396 // CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   2397 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2398 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2399 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2400 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2401 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2402 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2403 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
   2404 // CHECK:   ret <4 x i32> [[SUB]]
   2405 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
   2406   return vmlsl_laneq_u16(a, b, v, 0);
   2407 }
   2408 
   2409 // CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   2410 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2411 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2412 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2413 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2414 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2415 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2416 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
   2417 // CHECK:   ret <2 x i64> [[SUB]]
   2418 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
   2419   return vmlsl_laneq_u32(a, b, v, 0);
   2420 }
   2421 
   2422 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   2423 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2424 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2425 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2426 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2427 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2428 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2429 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2430 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
   2431 // CHECK:   ret <4 x i32> [[SUB]]
   2432 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
   2433   return vmlsl_high_lane_u16(a, b, v, 0);
   2434 }
   2435 
   2436 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   2437 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2438 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2439 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2440 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2441 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2442 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2443 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2444 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
   2445 // CHECK:   ret <2 x i64> [[SUB]]
   2446 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
   2447   return vmlsl_high_lane_u32(a, b, v, 0);
   2448 }
   2449 
   2450 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   2451 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2452 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2453 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2454 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2455 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2456 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2457 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2458 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
   2459 // CHECK:   ret <4 x i32> [[SUB]]
   2460 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
   2461   return vmlsl_high_laneq_u16(a, b, v, 0);
   2462 }
   2463 
   2464 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   2465 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2466 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2467 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2468 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2469 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2470 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2471 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2472 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
   2473 // CHECK:   ret <2 x i64> [[SUB]]
   2474 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
   2475   return vmlsl_high_laneq_u32(a, b, v, 0);
   2476 }
   2477 
   2478 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
   2479 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2480 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   2481 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2482 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2483 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2484 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2485 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   2486 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
   2487   return vmull_lane_s16(a, v, 0);
   2488 }
   2489 
   2490 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
   2491 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2492 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   2493 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2494 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2495 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2496 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2497 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   2498 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
   2499   return vmull_lane_s32(a, v, 0);
   2500 }
   2501 
   2502 // CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
   2503 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2504 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   2505 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2506 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2507 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2508 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2509 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   2510 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
   2511   return vmull_lane_u16(a, v, 0);
   2512 }
   2513 
   2514 // CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
   2515 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2516 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   2517 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2518 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2519 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2520 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2521 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   2522 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
   2523   return vmull_lane_u32(a, v, 0);
   2524 }
   2525 
   2526 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
   2527 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2528 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2529 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2530 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2531 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2532 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2533 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2534 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   2535 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
   2536   return vmull_high_lane_s16(a, v, 0);
   2537 }
   2538 
   2539 // CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
   2540 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   2541 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2542 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2543 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2544 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2545 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2546 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2547 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   2548 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
   2549   return vmull_high_lane_s32(a, v, 0);
   2550 }
   2551 
   2552 // CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
   2553 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2554 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2555 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2556 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2557 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2558 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2559 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2560 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   2561 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
   2562   return vmull_high_lane_u16(a, v, 0);
   2563 }
   2564 
   2565 // CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
   2566 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   2567 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2568 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2569 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2570 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2571 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2572 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2573 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   2574 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
   2575   return vmull_high_lane_u32(a, v, 0);
   2576 }
   2577 
   2578 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
   2579 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2580 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   2581 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2582 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2583 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2584 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2585 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   2586 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
   2587   return vmull_laneq_s16(a, v, 0);
   2588 }
   2589 
   2590 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
   2591 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2592 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   2593 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2594 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2595 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2596 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2597 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   2598 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
   2599   return vmull_laneq_s32(a, v, 0);
   2600 }
   2601 
   2602 // CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
   2603 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2604 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   2605 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2606 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2607 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2608 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2609 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   2610 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
   2611   return vmull_laneq_u16(a, v, 0);
   2612 }
   2613 
   2614 // CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
   2615 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2616 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   2617 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2618 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2619 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2620 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2621 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   2622 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
   2623   return vmull_laneq_u32(a, v, 0);
   2624 }
   2625 
   2626 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
   2627 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2628 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2629 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2630 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2631 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2632 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2633 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2634 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   2635 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
   2636   return vmull_high_laneq_s16(a, v, 0);
   2637 }
   2638 
   2639 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
   2640 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   2641 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2642 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2643 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2644 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2645 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2646 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2647 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   2648 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
   2649   return vmull_high_laneq_s32(a, v, 0);
   2650 }
   2651 
   2652 // CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
   2653 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2654 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2655 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2656 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2657 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2658 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2659 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
   2660 // CHECK:   ret <4 x i32> [[VMULL2_I]]
   2661 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
   2662   return vmull_high_laneq_u16(a, v, 0);
   2663 }
   2664 
   2665 // CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
   2666 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   2667 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2668 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2669 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2670 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2671 // CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2672 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
   2673 // CHECK:   ret <2 x i64> [[VMULL2_I]]
   2674 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
   2675   return vmull_high_laneq_u32(a, v, 0);
   2676 }
   2677 
   2678 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   2679 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2680 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   2681 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2682 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2683 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2684 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   2685 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   2686 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   2687 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   2688 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
   2689 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
   2690   return vqdmlal_lane_s16(a, b, v, 0);
   2691 }
   2692 
   2693 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   2694 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2695 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   2696 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2697 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2698 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2699 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   2700 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   2701 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   2702 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   2703 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
   2704 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
   2705   return vqdmlal_lane_s32(a, b, v, 0);
   2706 }
   2707 
   2708 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   2709 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2710 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2711 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   2712 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2713 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2714 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2715 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   2716 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   2717 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   2718 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   2719 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
   2720 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
   2721   return vqdmlal_high_lane_s16(a, b, v, 0);
   2722 }
   2723 
   2724 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   2725 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2726 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2727 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   2728 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2729 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2730 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2731 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   2732 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   2733 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   2734 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   2735 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
   2736 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
   2737   return vqdmlal_high_lane_s32(a, b, v, 0);
   2738 }
   2739 
   2740 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   2741 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2742 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   2743 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   2744 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2745 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2746 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   2747 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   2748 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   2749 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   2750 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
   2751 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
   2752   return vqdmlsl_lane_s16(a, b, v, 0);
   2753 }
   2754 
   2755 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   2756 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2757 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   2758 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   2759 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2760 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2761 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   2762 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   2763 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   2764 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   2765 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
   2766 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
   2767   return vqdmlsl_lane_s32(a, b, v, 0);
   2768 }
   2769 
   2770 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   2771 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2772 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2773 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   2774 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2775 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2776 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2777 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   2778 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   2779 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   2780 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   2781 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
   2782 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
   2783   return vqdmlsl_high_lane_s16(a, b, v, 0);
   2784 }
   2785 
   2786 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   2787 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   2788 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2789 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   2790 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2791 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2792 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2793 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   2794 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   2795 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   2796 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   2797 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
   2798 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
   2799   return vqdmlsl_high_lane_s32(a, b, v, 0);
   2800 }
   2801 
   2802 // CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
   2803 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2804 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   2805 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2806 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2807 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2808 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
   2809 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
   2810 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
   2811 // CHECK:   ret <4 x i32> [[TMP2]]
   2812 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
   2813   return vqdmull_lane_s16(a, v, 0);
   2814 }
   2815 
   2816 // CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
   2817 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2818 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   2819 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2820 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2821 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2822 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
   2823 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
   2824 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
   2825 // CHECK:   ret <2 x i64> [[TMP2]]
   2826 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
   2827   return vqdmull_lane_s32(a, v, 0);
   2828 }
   2829 
   2830 // CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
   2831 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2832 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   2833 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2834 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2835 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2836 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
   2837 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
   2838 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
   2839 // CHECK:   ret <4 x i32> [[TMP2]]
   2840 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
   2841   return vqdmull_laneq_s16(a, v, 0);
   2842 }
   2843 
   2844 // CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
   2845 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2846 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   2847 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2848 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2849 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2850 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
   2851 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
   2852 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
   2853 // CHECK:   ret <2 x i64> [[TMP2]]
   2854 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
   2855   return vqdmull_laneq_s32(a, v, 0);
   2856 }
   2857 
   2858 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
   2859 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2860 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2861 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2862 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2863 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2864 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2865 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
   2866 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
   2867 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
   2868 // CHECK:   ret <4 x i32> [[TMP2]]
   2869 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
   2870   return vqdmull_high_lane_s16(a, v, 0);
   2871 }
   2872 
   2873 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
   2874 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   2875 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2876 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2877 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2878 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2879 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2880 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
   2881 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
   2882 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
   2883 // CHECK:   ret <2 x i64> [[TMP2]]
   2884 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
   2885   return vqdmull_high_lane_s32(a, v, 0);
   2886 }
   2887 
   2888 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
   2889 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2890 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   2891 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   2892 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2893 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2894 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2895 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
   2896 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
   2897 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
   2898 // CHECK:   ret <4 x i32> [[TMP2]]
   2899 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
   2900   return vqdmull_high_laneq_s16(a, v, 0);
   2901 }
   2902 
   2903 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
   2904 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   2905 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   2906 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   2907 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2908 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2909 // CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2910 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
   2911 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
   2912 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
   2913 // CHECK:   ret <2 x i64> [[TMP2]]
   2914 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
   2915   return vqdmull_high_laneq_s32(a, v, 0);
   2916 }
   2917 
   2918 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
   2919 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2920 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   2921 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2922 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2923 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2924 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
   2925 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
   2926 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
   2927 // CHECK:   ret <4 x i16> [[TMP2]]
   2928 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
   2929   return vqdmulh_lane_s16(a, v, 0);
   2930 }
   2931 
   2932 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
   2933 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
   2934 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   2935 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
   2936 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   2937 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   2938 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
   2939 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
   2940 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
   2941 // CHECK:   ret <8 x i16> [[TMP2]]
   2942 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
   2943   return vqdmulhq_lane_s16(a, v, 0);
   2944 }
   2945 
   2946 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
   2947 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   2948 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   2949 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   2950 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   2951 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   2952 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
   2953 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
   2954 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
   2955 // CHECK:   ret <2 x i32> [[TMP2]]
   2956 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
   2957   return vqdmulh_lane_s32(a, v, 0);
   2958 }
   2959 
   2960 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
   2961 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
   2962 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   2963 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
   2964 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   2965 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   2966 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
   2967 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
   2968 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
   2969 // CHECK:   ret <4 x i32> [[TMP2]]
   2970 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
   2971   return vqdmulhq_lane_s32(a, v, 0);
   2972 }
   2973 
   2974 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
   2975 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   2976 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   2977 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   2978 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   2979 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   2980 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
   2981 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
   2982 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
   2983 // CHECK:   ret <4 x i16> [[TMP2]]
   2984 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
   2985   return vqrdmulh_lane_s16(a, v, 0);
   2986 }
   2987 
   2988 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
   2989 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
   2990 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   2991 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
   2992 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   2993 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   2994 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
   2995 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
   2996 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
   2997 // CHECK:   ret <8 x i16> [[TMP2]]
   2998 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
   2999   return vqrdmulhq_lane_s16(a, v, 0);
   3000 }
   3001 
   3002 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
   3003 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   3004 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   3005 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   3006 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3007 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3008 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
   3009 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
   3010 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
   3011 // CHECK:   ret <2 x i32> [[TMP2]]
   3012 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
   3013   return vqrdmulh_lane_s32(a, v, 0);
   3014 }
   3015 
   3016 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
   3017 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
   3018 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   3019 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
   3020 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   3021 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   3022 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
   3023 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
   3024 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
   3025 // CHECK:   ret <4 x i32> [[TMP2]]
   3026 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
   3027   return vqrdmulhq_lane_s32(a, v, 0);
   3028 }
   3029 
   3030 // CHECK-LABEL: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
   3031 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
   3032 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
   3033 // CHECK:   ret <2 x float> [[MUL]]
   3034 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
   3035   return vmul_lane_f32(a, v, 0);
   3036 }
   3037 
   3038 // CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
   3039 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
   3040 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
   3041 // CHECK:   ret <4 x float> [[MUL]]
   3042 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
   3043   return vmulq_lane_f32(a, v, 0);
   3044 }
   3045 
   3046 // CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
   3047 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
   3048 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
   3049 // CHECK:   ret <2 x float> [[MUL]]
   3050 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
   3051   return vmul_laneq_f32(a, v, 0);
   3052 }
   3053 
   3054 // CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) #0 {
   3055 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
   3056 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
   3057 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
   3058 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   3059 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
   3060 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
   3061 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
   3062 // CHECK:   ret <1 x double> [[TMP5]]
   3063 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
   3064   return vmul_laneq_f64(a, v, 0);
   3065 }
   3066 
   3067 // CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
   3068 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
   3069 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
   3070 // CHECK:   ret <4 x float> [[MUL]]
   3071 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
   3072   return vmulq_laneq_f32(a, v, 0);
   3073 }
   3074 
   3075 // CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
   3076 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
   3077 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
   3078 // CHECK:   ret <2 x double> [[MUL]]
   3079 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
   3080   return vmulq_laneq_f64(a, v, 0);
   3081 }
   3082 
   3083 // CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
   3084 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
   3085 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   3086 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
   3087 // CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   3088 // CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   3089 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
   3090 // CHECK:   ret <2 x float> [[VMULX2_I]]
   3091 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
   3092   return vmulx_lane_f32(a, v, 0);
   3093 }
   3094 
   3095 // CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
   3096 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
   3097 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   3098 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
   3099 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   3100 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   3101 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
   3102 // CHECK:   ret <4 x float> [[VMULX2_I]]
   3103 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
   3104   return vmulxq_lane_f32(a, v, 0);
   3105 }
   3106 
   3107 // CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) #0 {
   3108 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
   3109 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   3110 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
   3111 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   3112 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   3113 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
   3114 // CHECK:   ret <2 x double> [[VMULX2_I]]
   3115 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
   3116   return vmulxq_lane_f64(a, v, 0);
   3117 }
   3118 
   3119 // CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
   3120 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
   3121 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   3122 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
   3123 // CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   3124 // CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   3125 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
   3126 // CHECK:   ret <2 x float> [[VMULX2_I]]
   3127 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
   3128   return vmulx_laneq_f32(a, v, 0);
   3129 }
   3130 
   3131 // CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
   3132 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
   3133 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   3134 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
   3135 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   3136 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   3137 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
   3138 // CHECK:   ret <4 x float> [[VMULX2_I]]
   3139 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
   3140   return vmulxq_laneq_f32(a, v, 0);
   3141 }
   3142 
   3143 // CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
   3144 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
   3145 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   3146 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
   3147 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   3148 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   3149 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
   3150 // CHECK:   ret <2 x double> [[VMULX2_I]]
   3151 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
   3152   return vmulxq_laneq_f64(a, v, 0);
   3153 }
   3154 
   3155 // CHECK-LABEL: define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
   3156 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   3157 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   3158 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3159 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
   3160 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
   3161 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
   3162 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
   3163 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3164 // CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3165 // CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
   3166 // CHECK:   ret <4 x i32> [[VMULL5_I_I]]
   3167 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
   3168   return vmull_high_n_s16(a, b);
   3169 }
   3170 
   3171 // CHECK-LABEL: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
   3172 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   3173 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
   3174 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3175 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
   3176 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
   3177 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3178 // CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3179 // CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
   3180 // CHECK:   ret <2 x i64> [[VMULL3_I_I]]
   3181 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
   3182   return vmull_high_n_s32(a, b);
   3183 }
   3184 
   3185 // CHECK-LABEL: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
   3186 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   3187 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   3188 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3189 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
   3190 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
   3191 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
   3192 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
   3193 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3194 // CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3195 // CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
   3196 // CHECK:   ret <4 x i32> [[VMULL5_I_I]]
   3197 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
   3198   return vmull_high_n_u16(a, b);
   3199 }
   3200 
   3201 // CHECK-LABEL: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
   3202 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   3203 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
   3204 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3205 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
   3206 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
   3207 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3208 // CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3209 // CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
   3210 // CHECK:   ret <2 x i64> [[VMULL3_I_I]]
   3211 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
   3212   return vmull_high_n_u32(a, b);
   3213 }
   3214 
   3215 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
   3216 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   3217 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   3218 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3219 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
   3220 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
   3221 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
   3222 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
   3223 // CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3224 // CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3225 // CHECK:   [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V4_I_I]]) #2
   3226 // CHECK:   [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
   3227 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I_I]] to <4 x i32>
   3228 // CHECK:   ret <4 x i32> [[TMP2]]
   3229 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
   3230   return vqdmull_high_n_s16(a, b);
   3231 }
   3232 
   3233 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
   3234 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   3235 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
   3236 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3237 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
   3238 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
   3239 // CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3240 // CHECK:   [[VQDMULL_V2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3241 // CHECK:   [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V2_I_I]]) #2
   3242 // CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
   3243 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I_I]] to <2 x i64>
   3244 // CHECK:   ret <2 x i64> [[TMP2]]
   3245 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
   3246   return vqdmull_high_n_s32(a, b);
   3247 }
   3248 
   3249 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
   3250 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   3251 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3252 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
   3253 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
   3254 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
   3255 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   3256 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
   3257 // CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3258 // CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3259 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
   3260 // CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
   3261 // CHECK:   ret <4 x i32> [[ADD_I_I]]
   3262 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
   3263   return vmlal_high_n_s16(a, b, c);
   3264 }
   3265 
   3266 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
   3267 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   3268 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   3269 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
   3270 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
   3271 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
   3272 // CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3273 // CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3274 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
   3275 // CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
   3276 // CHECK:   ret <2 x i64> [[ADD_I_I]]
   3277 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
   3278   return vmlal_high_n_s32(a, b, c);
   3279 }
   3280 
   3281 // CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
   3282 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   3283 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3284 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
   3285 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
   3286 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
   3287 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   3288 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
   3289 // CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3290 // CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3291 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
   3292 // CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
   3293 // CHECK:   ret <4 x i32> [[ADD_I_I]]
   3294 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
   3295   return vmlal_high_n_u16(a, b, c);
   3296 }
   3297 
   3298 // CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
   3299 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   3300 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   3301 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
   3302 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
   3303 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
   3304 // CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3305 // CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3306 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
   3307 // CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
   3308 // CHECK:   ret <2 x i64> [[ADD_I_I]]
   3309 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
   3310   return vmlal_high_n_u32(a, b, c);
   3311 }
   3312 
   3313 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
   3314 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   3315 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   3316 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   3317 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3318 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
   3319 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
   3320 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
   3321 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
   3322 // CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3323 // CHECK:   [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   3324 // CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
   3325 // CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   3326 // CHECK:   [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
   3327 // CHECK:   ret <4 x i32> [[VQDMLAL_V6_I_I]]
   3328 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
   3329   return vqdmlal_high_n_s16(a, b, c);
   3330 }
   3331 
   3332 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
   3333 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   3334 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   3335 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
   3336 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   3337 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
   3338 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
   3339 // CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3340 // CHECK:   [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   3341 // CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
   3342 // CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   3343 // CHECK:   [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
   3344 // CHECK:   ret <2 x i64> [[VQDMLAL_V4_I_I]]
   3345 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
   3346   return vqdmlal_high_n_s32(a, b, c);
   3347 }
   3348 
   3349 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
   3350 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   3351 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3352 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
   3353 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
   3354 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
   3355 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   3356 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
   3357 // CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3358 // CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3359 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
   3360 // CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
   3361 // CHECK:   ret <4 x i32> [[SUB_I_I]]
   3362 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
   3363   return vmlsl_high_n_s16(a, b, c);
   3364 }
   3365 
   3366 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
   3367 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   3368 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   3369 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
   3370 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
   3371 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
   3372 // CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3373 // CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3374 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
   3375 // CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
   3376 // CHECK:   ret <2 x i64> [[SUB_I_I]]
   3377 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
   3378   return vmlsl_high_n_s32(a, b, c);
   3379 }
   3380 
   3381 // CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
   3382 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   3383 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3384 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
   3385 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
   3386 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
   3387 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   3388 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
   3389 // CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3390 // CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3391 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
   3392 // CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
   3393 // CHECK:   ret <4 x i32> [[SUB_I_I]]
   3394 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
   3395   return vmlsl_high_n_u16(a, b, c);
   3396 }
   3397 
   3398 // CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
   3399 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   3400 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   3401 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
   3402 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
   3403 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
   3404 // CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3405 // CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3406 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
   3407 // CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
   3408 // CHECK:   ret <2 x i64> [[SUB_I_I]]
   3409 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
   3410   return vmlsl_high_n_u32(a, b, c);
   3411 }
   3412 
   3413 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
   3414 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   3415 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   3416 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   3417 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3418 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
   3419 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
   3420 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
   3421 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
   3422 // CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3423 // CHECK:   [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   3424 // CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
   3425 // CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   3426 // CHECK:   [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
   3427 // CHECK:   ret <4 x i32> [[VQDMLSL_V6_I_I]]
   3428 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
   3429   return vqdmlsl_high_n_s16(a, b, c);
   3430 }
   3431 
   3432 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
   3433 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   3434 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   3435 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
   3436 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   3437 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
   3438 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
   3439 // CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3440 // CHECK:   [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   3441 // CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
   3442 // CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   3443 // CHECK:   [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
   3444 // CHECK:   ret <2 x i64> [[VQDMLSL_V4_I_I]]
   3445 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
   3446   return vqdmlsl_high_n_s32(a, b, c);
   3447 }
   3448 
   3449 // CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
   3450 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
   3451 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
   3452 // CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
   3453 // CHECK:   ret <2 x float> [[MUL_I]]
   3454 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
   3455   return vmul_n_f32(a, b);
   3456 }
   3457 
   3458 // CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
   3459 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
   3460 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
   3461 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
   3462 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
   3463 // CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
   3464 // CHECK:   ret <4 x float> [[MUL_I]]
   3465 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
   3466   return vmulq_n_f32(a, b);
   3467 }
   3468 
   3469 // CHECK-LABEL: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
   3470 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
   3471 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
   3472 // CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
   3473 // CHECK:   ret <2 x double> [[MUL_I]]
   3474 float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
   3475   return vmulq_n_f64(a, b);
   3476 }
   3477 
   3478 // CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
   3479 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
   3480 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
   3481 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   3482 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   3483 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
   3484 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   3485 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   3486 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
   3487 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
   3488 // CHECK:   ret <2 x float> [[TMP6]]
   3489 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
   3490   return vfma_n_f32(a, b, n);
   3491 }
   3492 
   3493 // CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
   3494 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
   3495 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
   3496 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
   3497 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
   3498 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   3499 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   3500 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
   3501 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   3502 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   3503 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
   3504 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
   3505 // CHECK:   ret <4 x float> [[TMP6]]
   3506 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
   3507   return vfmaq_n_f32(a, b, n);
   3508 }
   3509 
   3510 // CHECK-LABEL: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
   3511 // CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
   3512 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
   3513 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
   3514 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   3515 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
   3516 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
   3517 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   3518 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   3519 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
   3520 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
   3521 // CHECK:   ret <2 x float> [[TMP6]]
   3522 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
   3523   return vfms_n_f32(a, b, n);
   3524 }
   3525 
   3526 // CHECK-LABEL: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
   3527 // CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
   3528 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
   3529 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
   3530 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
   3531 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
   3532 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   3533 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
   3534 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
   3535 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   3536 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   3537 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
   3538 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
   3539 // CHECK:   ret <4 x float> [[TMP6]]
   3540 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
   3541   return vfmsq_n_f32(a, b, n);
   3542 }
   3543 
   3544 // CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 {
   3545 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3546 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
   3547 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3548 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3549 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
   3550 // CHECK:   ret <4 x i16> [[MUL_I]]
   3551 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
   3552   return vmul_n_s16(a, b);
   3553 }
   3554 
   3555 // CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 {
   3556 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
   3557 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
   3558 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3559 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3560 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
   3561 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
   3562 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
   3563 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
   3564 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
   3565 // CHECK:   ret <8 x i16> [[MUL_I]]
   3566 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
   3567   return vmulq_n_s16(a, b);
   3568 }
   3569 
   3570 // CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
   3571 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3572 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
   3573 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
   3574 // CHECK:   ret <2 x i32> [[MUL_I]]
   3575 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
   3576   return vmul_n_s32(a, b);
   3577 }
   3578 
   3579 // CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
   3580 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
   3581 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
   3582 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
   3583 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
   3584 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
   3585 // CHECK:   ret <4 x i32> [[MUL_I]]
   3586 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
   3587   return vmulq_n_s32(a, b);
   3588 }
   3589 
   3590 // CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 {
   3591 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3592 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
   3593 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3594 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3595 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
   3596 // CHECK:   ret <4 x i16> [[MUL_I]]
   3597 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
   3598   return vmul_n_u16(a, b);
   3599 }
   3600 
   3601 // CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 {
   3602 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
   3603 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
   3604 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3605 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3606 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
   3607 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
   3608 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
   3609 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
   3610 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
   3611 // CHECK:   ret <8 x i16> [[MUL_I]]
   3612 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
   3613   return vmulq_n_u16(a, b);
   3614 }
   3615 
   3616 // CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
   3617 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3618 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
   3619 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
   3620 // CHECK:   ret <2 x i32> [[MUL_I]]
   3621 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
   3622   return vmul_n_u32(a, b);
   3623 }
   3624 
   3625 // CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
   3626 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
   3627 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
   3628 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
   3629 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
   3630 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
   3631 // CHECK:   ret <4 x i32> [[MUL_I]]
   3632 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
   3633   return vmulq_n_u32(a, b);
   3634 }
   3635 
   3636 // CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 {
   3637 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   3638 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3639 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
   3640 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3641 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3642 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   3643 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3644 // CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3645 // CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
   3646 // CHECK:   ret <4 x i32> [[VMULL5_I]]
   3647 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
   3648   return vmull_n_s16(a, b);
   3649 }
   3650 
   3651 // CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
   3652 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   3653 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3654 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
   3655 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   3656 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3657 // CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3658 // CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
   3659 // CHECK:   ret <2 x i64> [[VMULL3_I]]
   3660 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
   3661   return vmull_n_s32(a, b);
   3662 }
   3663 
   3664 // CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 {
   3665 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   3666 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3667 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
   3668 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3669 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3670 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   3671 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3672 // CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3673 // CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
   3674 // CHECK:   ret <4 x i32> [[VMULL5_I]]
   3675 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
   3676   return vmull_n_u16(a, b);
   3677 }
   3678 
   3679 // CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
   3680 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   3681 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3682 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
   3683 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   3684 // CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3685 // CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3686 // CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
   3687 // CHECK:   ret <2 x i64> [[VMULL3_I]]
   3688 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
   3689   return vmull_n_u32(a, b);
   3690 }
   3691 
   3692 // CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 {
   3693 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   3694 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3695 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
   3696 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3697 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3698 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   3699 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3700 // CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3701 // CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #2
   3702 // CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
   3703 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
   3704 // CHECK:   ret <4 x i32> [[TMP2]]
   3705 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
   3706   return vqdmull_n_s16(a, b);
   3707 }
   3708 
   3709 // CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
   3710 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   3711 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3712 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
   3713 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   3714 // CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3715 // CHECK:   [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3716 // CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #2
   3717 // CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
   3718 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
   3719 // CHECK:   ret <2 x i64> [[TMP2]]
   3720 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
   3721   return vqdmull_n_s32(a, b);
   3722 }
   3723 
   3724 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
   3725 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   3726 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3727 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
   3728 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3729 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3730 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   3731 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3732 // CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3733 // CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #2
   3734 // CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
   3735 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
   3736 // CHECK:   ret <4 x i16> [[TMP2]]
   3737 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
   3738   return vqdmulh_n_s16(a, b);
   3739 }
   3740 
   3741 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
   3742 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   3743 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
   3744 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
   3745 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3746 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3747 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
   3748 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
   3749 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
   3750 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
   3751 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
   3752 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   3753 // CHECK:   [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   3754 // CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #2
   3755 // CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
   3756 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
   3757 // CHECK:   ret <8 x i16> [[TMP2]]
   3758 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
   3759   return vqdmulhq_n_s16(a, b);
   3760 }
   3761 
   3762 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
   3763 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   3764 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3765 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
   3766 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   3767 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3768 // CHECK:   [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3769 // CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #2
   3770 // CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
   3771 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
   3772 // CHECK:   ret <2 x i32> [[TMP2]]
   3773 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
   3774   return vqdmulh_n_s32(a, b);
   3775 }
   3776 
   3777 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
   3778 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   3779 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
   3780 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
   3781 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
   3782 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
   3783 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
   3784 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   3785 // CHECK:   [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   3786 // CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #2
   3787 // CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
   3788 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
   3789 // CHECK:   ret <4 x i32> [[TMP2]]
   3790 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
   3791   return vqdmulhq_n_s32(a, b);
   3792 }
   3793 
   3794 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
   3795 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   3796 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
   3797 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
   3798 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3799 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3800 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   3801 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3802 // CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3803 // CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #2
   3804 // CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
   3805 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
   3806 // CHECK:   ret <4 x i16> [[TMP2]]
   3807 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
   3808   return vqrdmulh_n_s16(a, b);
   3809 }
   3810 
   3811 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
   3812 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   3813 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
   3814 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
   3815 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
   3816 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
   3817 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
   3818 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
   3819 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
   3820 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
   3821 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
   3822 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   3823 // CHECK:   [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   3824 // CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #2
   3825 // CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
   3826 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
   3827 // CHECK:   ret <8 x i16> [[TMP2]]
   3828 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
   3829   return vqrdmulhq_n_s16(a, b);
   3830 }
   3831 
   3832 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
   3833 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   3834 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
   3835 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
   3836 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   3837 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3838 // CHECK:   [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3839 // CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #2
   3840 // CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
   3841 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
   3842 // CHECK:   ret <2 x i32> [[TMP2]]
   3843 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
   3844   return vqrdmulh_n_s32(a, b);
   3845 }
   3846 
   3847 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
   3848 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   3849 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
   3850 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
   3851 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
   3852 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
   3853 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
   3854 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   3855 // CHECK:   [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   3856 // CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #2
   3857 // CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
   3858 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
   3859 // CHECK:   ret <4 x i32> [[TMP2]]
   3860 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
   3861   return vqrdmulhq_n_s32(a, b);
   3862 }
   3863 
   3864 // CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
   3865 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3866 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   3867 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   3868 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   3869 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
   3870 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
   3871 // CHECK:   ret <4 x i16> [[ADD_I]]
   3872 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
   3873   return vmla_n_s16(a, b, c);
   3874 }
   3875 
   3876 // CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
   3877 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
   3878 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
   3879 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
   3880 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
   3881 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
   3882 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
   3883 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
   3884 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
   3885 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
   3886 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
   3887 // CHECK:   ret <8 x i16> [[ADD_I]]
   3888 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
   3889   return vmlaq_n_s16(a, b, c);
   3890 }
   3891 
   3892 // CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
   3893 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   3894 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   3895 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
   3896 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
   3897 // CHECK:   ret <2 x i32> [[ADD_I]]
   3898 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   3899   return vmla_n_s32(a, b, c);
   3900 }
   3901 
   3902 // CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
   3903 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
   3904 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
   3905 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
   3906 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
   3907 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
   3908 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
   3909 // CHECK:   ret <4 x i32> [[ADD_I]]
   3910 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
   3911   return vmlaq_n_s32(a, b, c);
   3912 }
   3913 
   3914 // CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
   3915 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3916 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   3917 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   3918 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   3919 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
   3920 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
   3921 // CHECK:   ret <4 x i16> [[ADD_I]]
   3922 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   3923   return vmla_n_u16(a, b, c);
   3924 }
   3925 
   3926 // CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
   3927 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
   3928 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
   3929 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
   3930 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
   3931 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
   3932 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
   3933 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
   3934 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
   3935 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
   3936 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
   3937 // CHECK:   ret <8 x i16> [[ADD_I]]
   3938 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
   3939   return vmlaq_n_u16(a, b, c);
   3940 }
   3941 
   3942 // CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
   3943 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   3944 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   3945 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
   3946 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
   3947 // CHECK:   ret <2 x i32> [[ADD_I]]
   3948 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   3949   return vmla_n_u32(a, b, c);
   3950 }
   3951 
   3952 // CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
   3953 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
   3954 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
   3955 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
   3956 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
   3957 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
   3958 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
   3959 // CHECK:   ret <4 x i32> [[ADD_I]]
   3960 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
   3961   return vmlaq_n_u32(a, b, c);
   3962 }
   3963 
   3964 // CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
   3965 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3966 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   3967 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   3968 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   3969 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   3970 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   3971 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3972 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3973 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
   3974 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
   3975 // CHECK:   ret <4 x i32> [[ADD_I]]
   3976 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   3977   return vmlal_n_s16(a, b, c);
   3978 }
   3979 
   3980 // CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
   3981 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   3982 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   3983 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   3984 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   3985 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3986 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3987 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
   3988 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
   3989 // CHECK:   ret <2 x i64> [[ADD_I]]
   3990 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   3991   return vmlal_n_s32(a, b, c);
   3992 }
   3993 
   3994 // CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
   3995 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   3996 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   3997 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   3998 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   3999 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4000 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   4001 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4002 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4003 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
   4004 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
   4005 // CHECK:   ret <4 x i32> [[ADD_I]]
   4006 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
   4007   return vmlal_n_u16(a, b, c);
   4008 }
   4009 
   4010 // CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
   4011 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   4012 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   4013 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4014 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   4015 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4016 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4017 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
   4018 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
   4019 // CHECK:   ret <2 x i64> [[ADD_I]]
   4020 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
   4021   return vmlal_n_u32(a, b, c);
   4022 }
   4023 
   4024 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
   4025 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4026 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4027 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   4028 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   4029 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   4030 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   4031 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   4032 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4033 // CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4034 // CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
   4035 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4036 // CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
   4037 // CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
   4038 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   4039   return vqdmlal_n_s16(a, b, c);
   4040 }
   4041 
   4042 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
   4043 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4044 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4045 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   4046 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   4047 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   4048 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4049 // CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4050 // CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
   4051 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4052 // CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
   4053 // CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
   4054 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   4055   return vqdmlal_n_s32(a, b, c);
   4056 }
   4057 
   4058 // CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
   4059 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   4060 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   4061 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   4062 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   4063 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
   4064 // CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
   4065 // CHECK:   ret <4 x i16> [[SUB_I]]
   4066 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
   4067   return vmls_n_s16(a, b, c);
   4068 }
   4069 
   4070 // CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
   4071 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
   4072 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
   4073 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
   4074 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
   4075 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
   4076 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
   4077 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
   4078 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
   4079 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
   4080 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
   4081 // CHECK:   ret <8 x i16> [[SUB_I]]
   4082 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
   4083   return vmlsq_n_s16(a, b, c);
   4084 }
   4085 
   4086 // CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
   4087 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   4088 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   4089 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
   4090 // CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
   4091 // CHECK:   ret <2 x i32> [[SUB_I]]
   4092 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   4093   return vmls_n_s32(a, b, c);
   4094 }
   4095 
   4096 // CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
   4097 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
   4098 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
   4099 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
   4100 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
   4101 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
   4102 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
   4103 // CHECK:   ret <4 x i32> [[SUB_I]]
   4104 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
   4105   return vmlsq_n_s32(a, b, c);
   4106 }
   4107 
   4108 // CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
   4109 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   4110 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   4111 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   4112 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   4113 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
   4114 // CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
   4115 // CHECK:   ret <4 x i16> [[SUB_I]]
   4116 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   4117   return vmls_n_u16(a, b, c);
   4118 }
   4119 
   4120 // CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
   4121 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
   4122 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
   4123 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
   4124 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
   4125 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
   4126 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
   4127 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
   4128 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
   4129 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
   4130 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
   4131 // CHECK:   ret <8 x i16> [[SUB_I]]
   4132 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
   4133   return vmlsq_n_u16(a, b, c);
   4134 }
   4135 
   4136 // CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
   4137 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   4138 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   4139 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
   4140 // CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
   4141 // CHECK:   ret <2 x i32> [[SUB_I]]
   4142 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   4143   return vmls_n_u32(a, b, c);
   4144 }
   4145 
   4146 // CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
   4147 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
   4148 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
   4149 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
   4150 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
   4151 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
   4152 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
   4153 // CHECK:   ret <4 x i32> [[SUB_I]]
   4154 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
   4155   return vmlsq_n_u32(a, b, c);
   4156 }
   4157 
   4158 // CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
   4159 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   4160 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   4161 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   4162 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   4163 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4164 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   4165 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4166 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4167 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
   4168 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
   4169 // CHECK:   ret <4 x i32> [[SUB_I]]
   4170 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   4171   return vmlsl_n_s16(a, b, c);
   4172 }
   4173 
   4174 // CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
   4175 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   4176 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   4177 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4178 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   4179 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4180 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4181 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
   4182 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
   4183 // CHECK:   ret <2 x i64> [[SUB_I]]
   4184 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   4185   return vmlsl_n_s32(a, b, c);
   4186 }
   4187 
   4188 // CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
   4189 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   4190 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   4191 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   4192 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   4193 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4194 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   4195 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4196 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4197 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
   4198 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
   4199 // CHECK:   ret <4 x i32> [[SUB_I]]
   4200 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
   4201   return vmlsl_n_u16(a, b, c);
   4202 }
   4203 
   4204 // CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
   4205 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   4206 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   4207 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4208 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   4209 // CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4210 // CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4211 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
   4212 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
   4213 // CHECK:   ret <2 x i64> [[SUB_I]]
   4214 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
   4215   return vmlsl_n_u32(a, b, c);
   4216 }
   4217 
   4218 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
   4219 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4220 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4221 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
   4222 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
   4223 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
   4224 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
   4225 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
   4226 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4227 // CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4228 // CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
   4229 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4230 // CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
   4231 // CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
   4232 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   4233   return vqdmlsl_n_s16(a, b, c);
   4234 }
   4235 
   4236 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
   4237 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4238 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4239 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
   4240 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
   4241 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
   4242 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4243 // CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4244 // CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
   4245 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4246 // CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
   4247 // CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
   4248 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   4249   return vqdmlsl_n_s32(a, b, c);
   4250 }
   4251 
   4252 // CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   4253 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   4254 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   4255 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
   4256 // CHECK:   ret <4 x i16> [[ADD]]
   4257 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
   4258   return vmla_lane_u16(a, b, v, 0);
   4259 }
   4260 
   4261 // CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   4262 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
   4263 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   4264 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
   4265 // CHECK:   ret <8 x i16> [[ADD]]
   4266 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
   4267   return vmlaq_lane_u16(a, b, v, 0);
   4268 }
   4269 
   4270 // CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   4271 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   4272 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   4273 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
   4274 // CHECK:   ret <2 x i32> [[ADD]]
   4275 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
   4276   return vmla_lane_u32(a, b, v, 0);
   4277 }
   4278 
   4279 // CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   4280 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
   4281 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   4282 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
   4283 // CHECK:   ret <4 x i32> [[ADD]]
   4284 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
   4285   return vmlaq_lane_u32(a, b, v, 0);
   4286 }
   4287 
   4288 // CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   4289 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   4290 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   4291 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
   4292 // CHECK:   ret <4 x i16> [[ADD]]
   4293 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
   4294   return vmla_laneq_u16(a, b, v, 0);
   4295 }
   4296 
   4297 // CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   4298 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
   4299 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   4300 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
   4301 // CHECK:   ret <8 x i16> [[ADD]]
   4302 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
   4303   return vmlaq_laneq_u16(a, b, v, 0);
   4304 }
   4305 
   4306 // CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   4307 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   4308 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   4309 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
   4310 // CHECK:   ret <2 x i32> [[ADD]]
   4311 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
   4312   return vmla_laneq_u32(a, b, v, 0);
   4313 }
   4314 
   4315 // CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   4316 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
   4317 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   4318 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
   4319 // CHECK:   ret <4 x i32> [[ADD]]
   4320 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
   4321   return vmlaq_laneq_u32(a, b, v, 0);
   4322 }
   4323 
   4324 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   4325 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   4326 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4327 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4328 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4329 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4330 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4331 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   4332 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4333 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   4334 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
   4335 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
   4336   return vqdmlal_laneq_s16(a, b, v, 0);
   4337 }
   4338 
   4339 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   4340 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   4341 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4342 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4343 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4344 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4345 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4346 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   4347 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4348 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   4349 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
   4350 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
   4351   return vqdmlal_laneq_s32(a, b, v, 0);
   4352 }
   4353 
   4354 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   4355 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   4356 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   4357 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4358 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   4359 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4360 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4361 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4362 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   4363 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4364 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   4365 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
   4366 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
   4367   return vqdmlal_high_laneq_s16(a, b, v, 0);
   4368 }
   4369 
   4370 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   4371 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   4372 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   4373 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4374 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   4375 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4376 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4377 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4378 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   4379 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4380 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   4381 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
   4382 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
   4383   return vqdmlal_high_laneq_s32(a, b, v, 0);
   4384 }
   4385 
   4386 // CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   4387 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
   4388 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   4389 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
   4390 // CHECK:   ret <4 x i16> [[SUB]]
   4391 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
   4392   return vmls_lane_u16(a, b, v, 0);
   4393 }
   4394 
   4395 // CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   4396 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
   4397 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   4398 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
   4399 // CHECK:   ret <8 x i16> [[SUB]]
   4400 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
   4401   return vmlsq_lane_u16(a, b, v, 0);
   4402 }
   4403 
   4404 // CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   4405 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
   4406 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   4407 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
   4408 // CHECK:   ret <2 x i32> [[SUB]]
   4409 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
   4410   return vmls_lane_u32(a, b, v, 0);
   4411 }
   4412 
   4413 // CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   4414 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
   4415 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   4416 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
   4417 // CHECK:   ret <4 x i32> [[SUB]]
   4418 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
   4419   return vmlsq_lane_u32(a, b, v, 0);
   4420 }
   4421 
   4422 // CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   4423 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   4424 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   4425 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
   4426 // CHECK:   ret <4 x i16> [[SUB]]
   4427 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
   4428   return vmls_laneq_u16(a, b, v, 0);
   4429 }
   4430 
   4431 // CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   4432 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
   4433 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   4434 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
   4435 // CHECK:   ret <8 x i16> [[SUB]]
   4436 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
   4437   return vmlsq_laneq_u16(a, b, v, 0);
   4438 }
   4439 
   4440 // CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   4441 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   4442 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   4443 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
   4444 // CHECK:   ret <2 x i32> [[SUB]]
   4445 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
   4446   return vmls_laneq_u32(a, b, v, 0);
   4447 }
   4448 
   4449 // CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   4450 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
   4451 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   4452 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
   4453 // CHECK:   ret <4 x i32> [[SUB]]
   4454 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
   4455   return vmlsq_laneq_u32(a, b, v, 0);
   4456 }
   4457 
   4458 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   4459 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   4460 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4461 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4462 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4463 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4464 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4465 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   4466 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4467 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   4468 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
   4469 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
   4470   return vqdmlsl_laneq_s16(a, b, v, 0);
   4471 }
   4472 
   4473 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   4474 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   4475 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4476 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4477 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4478 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4479 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4480 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   4481 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4482 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   4483 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
   4484 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
   4485   return vqdmlsl_laneq_s32(a, b, v, 0);
   4486 }
   4487 
   4488 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   4489 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   4490 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   4491 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4492 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   4493 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4494 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4495 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4496 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   4497 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4498 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   4499 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
   4500 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
   4501   return vqdmlsl_high_laneq_s16(a, b, v, 0);
   4502 }
   4503 
   4504 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   4505 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   4506 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   4507 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4508 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   4509 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4510 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4511 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4512 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   4513 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4514 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   4515 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
   4516 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
   4517   return vqdmlsl_high_laneq_s32(a, b, v, 0);
   4518 }
   4519 
   4520 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
   4521 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   4522 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4523 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4524 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4525 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4526 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
   4527 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
   4528 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
   4529 // CHECK:   ret <4 x i16> [[TMP2]]
   4530 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
   4531   return vqdmulh_laneq_s16(a, v, 0);
   4532 }
   4533 
   4534 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
   4535 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
   4536 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   4537 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
   4538 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   4539 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   4540 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
   4541 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
   4542 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
   4543 // CHECK:   ret <8 x i16> [[TMP2]]
   4544 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   4545   return vqdmulhq_laneq_s16(a, v, 0);
   4546 }
   4547 
   4548 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
   4549 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   4550 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4551 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4552 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4553 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4554 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
   4555 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
   4556 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
   4557 // CHECK:   ret <2 x i32> [[TMP2]]
   4558 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
   4559   return vqdmulh_laneq_s32(a, v, 0);
   4560 }
   4561 
   4562 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
   4563 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
   4564 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4565 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
   4566 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4567 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   4568 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
   4569 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
   4570 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
   4571 // CHECK:   ret <4 x i32> [[TMP2]]
   4572 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   4573   return vqdmulhq_laneq_s32(a, v, 0);
   4574 }
   4575 
   4576 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
   4577 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
   4578 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4579 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4580 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4581 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4582 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
   4583 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
   4584 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
   4585 // CHECK:   ret <4 x i16> [[TMP2]]
   4586 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
   4587   return vqrdmulh_laneq_s16(a, v, 0);
   4588 }
   4589 
   4590 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
   4591 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
   4592 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   4593 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
   4594 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   4595 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   4596 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
   4597 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
   4598 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
   4599 // CHECK:   ret <8 x i16> [[TMP2]]
   4600 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   4601   return vqrdmulhq_laneq_s16(a, v, 0);
   4602 }
   4603 
   4604 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
   4605 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
   4606 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4607 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4608 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4609 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4610 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
   4611 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
   4612 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
   4613 // CHECK:   ret <2 x i32> [[TMP2]]
   4614 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
   4615   return vqrdmulh_laneq_s32(a, v, 0);
   4616 }
   4617 
   4618 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
   4619 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
   4620 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4621 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
   4622 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4623 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   4624 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
   4625 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
   4626 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
   4627 // CHECK:   ret <4 x i32> [[TMP2]]
   4628 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   4629   return vqrdmulhq_laneq_s32(a, v, 0);
   4630 }
   4631 
   4632 // CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   4633 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   4634 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   4635 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
   4636 // CHECK:   ret <4 x i16> [[ADD]]
   4637 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
   4638   return vmla_lane_u16(a, b, v, 3);
   4639 }
   4640 
   4641 // CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   4642 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   4643 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   4644 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
   4645 // CHECK:   ret <8 x i16> [[ADD]]
   4646 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
   4647   return vmlaq_lane_u16(a, b, v, 3);
   4648 }
   4649 
   4650 // CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   4651 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   4652 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   4653 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
   4654 // CHECK:   ret <2 x i32> [[ADD]]
   4655 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
   4656   return vmla_lane_u32(a, b, v, 1);
   4657 }
   4658 
   4659 // CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   4660 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   4661 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   4662 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
   4663 // CHECK:   ret <4 x i32> [[ADD]]
   4664 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
   4665   return vmlaq_lane_u32(a, b, v, 1);
   4666 }
   4667 
   4668 // CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   4669 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   4670 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   4671 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
   4672 // CHECK:   ret <4 x i16> [[ADD]]
   4673 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
   4674   return vmla_laneq_u16(a, b, v, 7);
   4675 }
   4676 
   4677 // CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   4678 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   4679 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   4680 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
   4681 // CHECK:   ret <8 x i16> [[ADD]]
   4682 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
   4683   return vmlaq_laneq_u16(a, b, v, 7);
   4684 }
   4685 
   4686 // CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   4687 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   4688 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   4689 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
   4690 // CHECK:   ret <2 x i32> [[ADD]]
   4691 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
   4692   return vmla_laneq_u32(a, b, v, 3);
   4693 }
   4694 
   4695 // CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   4696 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   4697 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   4698 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
   4699 // CHECK:   ret <4 x i32> [[ADD]]
   4700 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
   4701   return vmlaq_laneq_u32(a, b, v, 3);
   4702 }
   4703 
   4704 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   4705 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   4706 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4707 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4708 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4709 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4710 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4711 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   4712 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4713 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   4714 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
   4715 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
   4716   return vqdmlal_laneq_s16(a, b, v, 7);
   4717 }
   4718 
   4719 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   4720 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   4721 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4722 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4723 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4724 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4725 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4726 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   4727 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4728 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   4729 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
   4730 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
   4731   return vqdmlal_laneq_s32(a, b, v, 3);
   4732 }
   4733 
   4734 // CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   4735 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   4736 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   4737 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4738 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   4739 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4740 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4741 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4742 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   4743 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4744 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   4745 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
   4746 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
   4747   return vqdmlal_high_laneq_s16(a, b, v, 7);
   4748 }
   4749 
   4750 // CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   4751 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   4752 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   4753 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4754 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   4755 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4756 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4757 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4758 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   4759 // CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4760 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   4761 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
   4762 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
   4763   return vqdmlal_high_laneq_s32(a, b, v, 3);
   4764 }
   4765 
   4766 // CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
   4767 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   4768 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   4769 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
   4770 // CHECK:   ret <4 x i16> [[SUB]]
   4771 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
   4772   return vmls_lane_u16(a, b, v, 3);
   4773 }
   4774 
   4775 // CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
   4776 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   4777 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   4778 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
   4779 // CHECK:   ret <8 x i16> [[SUB]]
   4780 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
   4781   return vmlsq_lane_u16(a, b, v, 3);
   4782 }
   4783 
   4784 // CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
   4785 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
   4786 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   4787 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
   4788 // CHECK:   ret <2 x i32> [[SUB]]
   4789 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
   4790   return vmls_lane_u32(a, b, v, 1);
   4791 }
   4792 
   4793 // CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
   4794 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   4795 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   4796 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
   4797 // CHECK:   ret <4 x i32> [[SUB]]
   4798 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
   4799   return vmlsq_lane_u32(a, b, v, 1);
   4800 }
   4801 
   4802 // CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   4803 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   4804 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
   4805 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
   4806 // CHECK:   ret <4 x i16> [[SUB]]
   4807 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
   4808   return vmls_laneq_u16(a, b, v, 7);
   4809 }
   4810 
   4811 // CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   4812 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   4813 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
   4814 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
   4815 // CHECK:   ret <8 x i16> [[SUB]]
   4816 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
   4817   return vmlsq_laneq_u16(a, b, v, 7);
   4818 }
   4819 
   4820 // CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   4821 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   4822 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
   4823 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
   4824 // CHECK:   ret <2 x i32> [[SUB]]
   4825 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
   4826   return vmls_laneq_u32(a, b, v, 3);
   4827 }
   4828 
   4829 // CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   4830 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   4831 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
   4832 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
   4833 // CHECK:   ret <4 x i32> [[SUB]]
   4834 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
   4835   return vmlsq_laneq_u32(a, b, v, 3);
   4836 }
   4837 
   4838 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
   4839 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   4840 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4841 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4842 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4843 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4844 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4845 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   4846 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4847 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   4848 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
   4849 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
   4850   return vqdmlsl_laneq_s16(a, b, v, 7);
   4851 }
   4852 
   4853 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
   4854 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   4855 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4856 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4857 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4858 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4859 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4860 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   4861 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4862 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   4863 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
   4864 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
   4865   return vqdmlsl_laneq_s32(a, b, v, 3);
   4866 }
   4867 
   4868 // CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
   4869 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   4870 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   4871 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4872 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
   4873 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4874 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4875 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   4876 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
   4877 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4878 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
   4879 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
   4880 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
   4881   return vqdmlsl_high_laneq_s16(a, b, v, 7);
   4882 }
   4883 
   4884 // CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
   4885 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   4886 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   4887 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   4888 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
   4889 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4890 // CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4891 // CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   4892 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
   4893 // CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   4894 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
   4895 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
   4896 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
   4897   return vqdmlsl_high_laneq_s32(a, b, v, 3);
   4898 }
   4899 
   4900 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
   4901 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   4902 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4903 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4904 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4905 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4906 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
   4907 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
   4908 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
   4909 // CHECK:   ret <4 x i16> [[TMP2]]
   4910 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
   4911   return vqdmulh_laneq_s16(a, v, 7);
   4912 }
   4913 
   4914 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
   4915 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   4916 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   4917 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
   4918 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   4919 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   4920 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
   4921 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
   4922 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
   4923 // CHECK:   ret <8 x i16> [[TMP2]]
   4924 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
   4925   return vqdmulhq_laneq_s16(a, v, 7);
   4926 }
   4927 
   4928 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
   4929 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   4930 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4931 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4932 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4933 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4934 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
   4935 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
   4936 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
   4937 // CHECK:   ret <2 x i32> [[TMP2]]
   4938 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
   4939   return vqdmulh_laneq_s32(a, v, 3);
   4940 }
   4941 
   4942 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
   4943 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   4944 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4945 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
   4946 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4947 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   4948 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
   4949 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
   4950 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
   4951 // CHECK:   ret <4 x i32> [[TMP2]]
   4952 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
   4953   return vqdmulhq_laneq_s32(a, v, 3);
   4954 }
   4955 
   4956 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
   4957 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   4958 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4959 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
   4960 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4961 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4962 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
   4963 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
   4964 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
   4965 // CHECK:   ret <4 x i16> [[TMP2]]
   4966 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
   4967   return vqrdmulh_laneq_s16(a, v, 7);
   4968 }
   4969 
   4970 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
   4971 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   4972 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   4973 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
   4974 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   4975 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   4976 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
   4977 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
   4978 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
   4979 // CHECK:   ret <8 x i16> [[TMP2]]
   4980 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
   4981   return vqrdmulhq_laneq_s16(a, v, 7);
   4982 }
   4983 
   4984 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
   4985 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
   4986 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4987 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
   4988 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4989 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4990 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
   4991 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
   4992 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
   4993 // CHECK:   ret <2 x i32> [[TMP2]]
   4994 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
   4995   return vqrdmulh_laneq_s32(a, v, 3);
   4996 }
   4997 
   4998 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
   4999 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   5000 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5001 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
   5002 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5003 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   5004 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
   5005 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
   5006 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
   5007 // CHECK:   ret <4 x i32> [[TMP2]]
   5008 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
   5009   return vqrdmulhq_laneq_s32(a, v, 3);
   5010 }
   5011 
   5012