// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN:     -fallow-half-arguments-and-returns -ffp-contract=fast -S -emit-llvm -o - %s \
// RUN: | opt -S -mem2reg \
// RUN: | FileCheck %s

// Test new aarch64 intrinsics and types

#include <arm_neon.h>

// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
// CHECK:   ret <8 x i8> [[ADD_I]]
int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) {
  return vadd_s8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
// CHECK:   ret <4 x i16> [[ADD_I]]
int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) {
  return vadd_s16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
// CHECK:   ret <2 x i32> [[ADD_I]]
int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) {
  return vadd_s32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %v1, <1 x i64> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) {
  return vadd_s64(v1, v2);
}

// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %v1, <2 x float> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2
// CHECK:   ret <2 x float> [[ADD_I]]
float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) {
  return vadd_f32(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
// CHECK:   ret <8 x i8> [[ADD_I]]
uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) {
  return vadd_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
// CHECK:   ret <4 x i16> [[ADD_I]]
uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) {
  return vadd_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
// CHECK:   ret <2 x i32> [[ADD_I]]
uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) {
  return vadd_u32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %v1, <1 x i64> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
// CHECK:   ret <1 x i64> [[ADD_I]]
uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) {
  return vadd_u64(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
// CHECK:   ret <16 x i8> [[ADD_I]]
int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) {
  return vaddq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) {
  return vaddq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddq_s32(int32x4_t v1, int32x4_t v2) {
  return vaddq_s32(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) {
  return vaddq_s64(v1, v2);
}

// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2
// CHECK:   ret <4 x float> [[ADD_I]]
float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) {
  return vaddq_f32(v1, v2);
}

// CHECK-LABEL: define <2 x double> @test_vaddq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = fadd <2 x double> %v1, %v2
// CHECK:   ret <2 x double> [[ADD_I]]
float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) {
  return vaddq_f64(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
// CHECK:   ret <16 x i8> [[ADD_I]]
uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vaddq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vaddq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vaddq_u32(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vaddq_u64(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
// CHECK:   ret <8 x i8> [[SUB_I]]
int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) {
  return vsub_s8(v1, v2);
}
// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) {
  return vsub_s16(v1, v2);
}
// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) {
  return vsub_s32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %v1, <1 x i64> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
// CHECK:   ret <1 x i64> [[SUB_I]]
int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) {
  return vsub_s64(v1, v2);
}

// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %v1, <2 x float> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) {
  return vsub_f32(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
// CHECK:   ret <8 x i8> [[SUB_I]]
uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) {
  return vsub_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
// CHECK:   ret <4 x i16> [[SUB_I]]
uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) {
  return vsub_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
// CHECK:   ret <2 x i32> [[SUB_I]]
uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) {
  return vsub_u32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %v1, <1 x i64> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
// CHECK:   ret <1 x i64> [[SUB_I]]
uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) {
  return vsub_u64(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
// CHECK:   ret <16 x i8> [[SUB_I]]
int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) {
  return vsubq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) {
  return vsubq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vsubq_s32(int32x4_t v1, int32x4_t v2) {
  return vsubq_s32(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) {
  return vsubq_s64(v1, v2);
}

// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) {
  return vsubq_f32(v1, v2);
}

// CHECK-LABEL: define <2 x double> @test_vsubq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2
// CHECK:   ret <2 x double> [[SUB_I]]
float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) {
  return vsubq_f64(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
// CHECK:   ret <16 x i8> [[SUB_I]]
uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vsubq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vsubq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vsubq_u32(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vsubq_u64(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
// CHECK:   ret <8 x i8> [[MUL_I]]
int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) {
  return vmul_s8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
// CHECK:   ret <4 x i16> [[MUL_I]]
int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) {
  return vmul_s16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
// CHECK:   ret <2 x i32> [[MUL_I]]
int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) {
  return vmul_s32(v1, v2);
}

// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %v1, <2 x float> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2
// CHECK:   ret <2 x float> [[MUL_I]]
float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) {
  return vmul_f32(v1, v2);
}


// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
// CHECK:   ret <8 x i8> [[MUL_I]]
uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) {
  return vmul_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
// CHECK:   ret <4 x i16> [[MUL_I]]
uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) {
  return vmul_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
// CHECK:   ret <2 x i32> [[MUL_I]]
uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) {
  return vmul_u32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
// CHECK:   ret <16 x i8> [[MUL_I]]
int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) {
  return vmulq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
// CHECK:   ret <8 x i16> [[MUL_I]]
int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) {
  return vmulq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
// CHECK:   ret <4 x i32> [[MUL_I]]
int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) {
  return vmulq_s32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
// CHECK:   ret <16 x i8> [[MUL_I]]
uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vmulq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
// CHECK:   ret <8 x i16> [[MUL_I]]
uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vmulq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
// CHECK:   ret <4 x i32> [[MUL_I]]
uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vmulq_u32(v1, v2);
}

// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2
// CHECK:   ret <4 x float> [[MUL_I]]
float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) {
  return vmulq_f32(v1, v2);
}

// CHECK-LABEL: define <2 x double> @test_vmulq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2
// CHECK:   ret <2 x double> [[MUL_I]]
float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) {
  return vmulq_f64(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
// CHECK:   ret <8 x i8> [[VMUL_V_I]]
poly8x8_t test_vmul_p8(poly8x8_t v1, poly8x8_t v2) {
  // Expected instruction: pmul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  return vmul_p8(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
// CHECK:   ret <16 x i8> [[VMULQ_V_I]]
poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) {
  // Expected instruction: pmul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
  return vmulq_p8(v1, v2);
}


// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
// CHECK:   ret <8 x i8> [[ADD_I]]
int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
  return vmla_s8(v1, v2, v3);
}

// CHECK-LABEL: define <8 x i8> @test_vmla_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
  return vmla_s16(v1, v2, v3);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
// CHECK:   ret <2 x i32> [[ADD_I]]
int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
  return vmla_s32(v1, v2, v3);
}

// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]]
// CHECK:   ret <2 x float> [[ADD_I]]
float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
  return vmla_f32(v1, v2, v3);
}

// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
// CHECK:   ret <8 x i8> [[ADD_I]]
uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
  return vmla_u8(v1, v2, v3);
}

// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
// CHECK:   ret <4 x i16> [[ADD_I]]
uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
  return vmla_u16(v1, v2, v3);
}

// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
// CHECK:   ret <2 x i32> [[ADD_I]]
uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
  return vmla_u32(v1, v2, v3);
}

// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
// CHECK:   ret <16 x i8> [[ADD_I]]
int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
  return vmlaq_s8(v1, v2, v3);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
  return vmlaq_s16(v1, v2, v3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
  return vmlaq_s32(v1, v2, v3);
}

// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]]
// CHECK:   ret <4 x float> [[ADD_I]]
float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
  return vmlaq_f32(v1, v2, v3);
}

// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
// CHECK:   ret <16 x i8> [[ADD_I]]
uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
  return vmlaq_u8(v1, v2, v3);
}

// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
  return vmlaq_u16(v1, v2, v3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
  return vmlaq_u32(v1, v2, v3);
}

// CHECK-LABEL: define <2 x double> @test_vmlaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
// CHECK:   [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]]
// CHECK:   ret <2 x double> [[ADD_I]]
float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
  return vmlaq_f64(v1, v2, v3);
}

// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
// CHECK:   ret <8 x i8> [[SUB_I]]
int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
  return vmls_s8(v1, v2, v3);
}

// CHECK-LABEL: define <8 x i8> @test_vmls_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
  return vmls_s16(v1, v2, v3);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
  return vmls_s32(v1, v2, v3);
}

// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]]
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
  return vmls_f32(v1, v2, v3);
}

// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
// CHECK:   ret <8 x i8> [[SUB_I]]
uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
  return vmls_u8(v1, v2, v3);
}

// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
// CHECK:   ret <4 x i16> [[SUB_I]]
uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
  return vmls_u16(v1, v2, v3);
}

// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
// CHECK:   ret <2 x i32> [[SUB_I]]
uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
  return vmls_u32(v1, v2, v3);
}
// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
// CHECK:   ret <16 x i8> [[SUB_I]]
int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
  return vmlsq_s8(v1, v2, v3);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
  return vmlsq_s16(v1, v2, v3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
  return vmlsq_s32(v1, v2, v3);
}

// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]]
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
  return vmlsq_f32(v1, v2, v3);
}
// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
// CHECK:   ret <16 x i8> [[SUB_I]]
uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
  return vmlsq_u8(v1, v2, v3);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
  return vmlsq_u16(v1, v2, v3);
}

// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
  return vmlsq_u32(v1, v2, v3);
}

// CHECK-LABEL: define <2 x double> @test_vmlsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]]
// CHECK:   ret <2 x double> [[SUB_I]]
float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
  return vmlsq_f64(v1, v2, v3);
}
// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
// CHECK:   ret <2 x float> [[TMP6]]
float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
  return vfma_f32(v1, v2, v3);
}

// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
// CHECK:   ret <4 x float> [[TMP6]]
float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
  return vfmaq_f32(v1, v2, v3);
}

// CHECK-LABEL: define <2 x double> @test_vfmaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #4
// CHECK:   ret <2 x double> [[TMP6]]
float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
  return vfmaq_f64(v1, v2, v3);
}
    676 // CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
    677 // CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v2
    678 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
    679 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
    680 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
    681 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
    682 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
    683 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
    684 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
    685 // CHECK:   ret <2 x float> [[TMP6]]
    686 float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
    687   return vfms_f32(v1, v2, v3);
    688 }
    689 
    690 // CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
    691 // CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v2
    692 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
    693 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
    694 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
    695 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
    696 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
    697 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
    698 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
    699 // CHECK:   ret <4 x float> [[TMP6]]
    700 float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
    701   return vfmsq_f32(v1, v2, v3);
    702 }
    703 
    704 // CHECK-LABEL: define <2 x double> @test_vfmsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
    705 // CHECK:   [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v2
    706 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
    707 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
    708 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
    709 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
    710 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
    711 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
    712 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #4
    713 // CHECK:   ret <2 x double> [[TMP6]]
    714 float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
    715   return vfmsq_f64(v1, v2, v3);
    716 }
    717 
    718 // CHECK-LABEL: define <2 x double> @test_vdivq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
    719 // CHECK:   [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2
    720 // CHECK:   ret <2 x double> [[DIV_I]]
    721 float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) {
    722   return vdivq_f64(v1, v2);
    723 }
    724 
    725 // CHECK-LABEL: define <4 x float> @test_vdivq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
    726 // CHECK:   [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2
    727 // CHECK:   ret <4 x float> [[DIV_I]]
    728 float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) {
    729   return vdivq_f32(v1, v2);
    730 }
    731 
    732 // CHECK-LABEL: define <2 x float> @test_vdiv_f32(<2 x float> %v1, <2 x float> %v2) #0 {
    733 // CHECK:   [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2
    734 // CHECK:   ret <2 x float> [[DIV_I]]
    735 float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
    736   return vdiv_f32(v1, v2);
    737 }
    738 
    739 // CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
    740 // CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4
    741 // CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
    742 // CHECK:   ret <8 x i8> [[ADD_I]]
    743 int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
    744   return vaba_s8(v1, v2, v3);
    745 }
    746 
    747 // CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
    748 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
    749 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
    750 // CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    751 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    752 // CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
    753 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
    754 // CHECK:   ret <4 x i16> [[ADD_I]]
    755 int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
    756   return vaba_s16(v1, v2, v3);
    757 }
    758 
    759 // CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
    760 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
    761 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
    762 // CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    763 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    764 // CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
    765 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
    766 // CHECK:   ret <2 x i32> [[ADD_I]]
    767 int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
    768   return vaba_s32(v1, v2, v3);
    769 }
    770 
    771 // CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
    772 // CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4
    773 // CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
    774 // CHECK:   ret <8 x i8> [[ADD_I]]
    775 uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
    776   return vaba_u8(v1, v2, v3);
    777 }
    778 
    779 // CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
    780 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
    781 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
    782 // CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    783 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    784 // CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
    785 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
    786 // CHECK:   ret <4 x i16> [[ADD_I]]
    787 uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
    788   return vaba_u16(v1, v2, v3);
    789 }
    790 
    791 // CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
    792 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
    793 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
    794 // CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    795 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    796 // CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
    797 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
    798 // CHECK:   ret <2 x i32> [[ADD_I]]
    799 uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
    800   return vaba_u32(v1, v2, v3);
    801 }
    802 
    803 // CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
    804 // CHECK:   [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4
    805 // CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
    806 // CHECK:   ret <16 x i8> [[ADD_I]]
    807 int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
    808   return vabaq_s8(v1, v2, v3);
    809 }
    810 
    811 // CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
    812 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
    813 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
    814 // CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
    815 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
    816 // CHECK:   [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4
    817 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
    818 // CHECK:   ret <8 x i16> [[ADD_I]]
    819 int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
    820   return vabaq_s16(v1, v2, v3);
    821 }
    822 
    823 // CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
    824 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
    825 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
    826 // CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
    827 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
    828 // CHECK:   [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4
    829 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
    830 // CHECK:   ret <4 x i32> [[ADD_I]]
    831 int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
    832   return vabaq_s32(v1, v2, v3);
    833 }
    834 
    835 // CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
    836 // CHECK:   [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4
    837 // CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
    838 // CHECK:   ret <16 x i8> [[ADD_I]]
    839 uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
    840   return vabaq_u8(v1, v2, v3);
    841 }
    842 
    843 // CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
    844 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
    845 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
    846 // CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
    847 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
    848 // CHECK:   [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4
    849 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
    850 // CHECK:   ret <8 x i16> [[ADD_I]]
    851 uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
    852   return vabaq_u16(v1, v2, v3);
    853 }
    854 
    855 // CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
    856 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
    857 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
    858 // CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
    859 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
    860 // CHECK:   [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4
    861 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
    862 // CHECK:   ret <4 x i32> [[ADD_I]]
    863 uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
    864   return vabaq_u32(v1, v2, v3);
    865 }
    866 
    867 // CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
    868 // CHECK:   [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
    869 // CHECK:   ret <8 x i8> [[VABD_I]]
    870 int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) {
    871   return vabd_s8(v1, v2);
    872 }
    873 
    874 // CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
    875 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
    876 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
    877 // CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    878 // CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    879 // CHECK:   [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4
    880 // CHECK:   ret <4 x i16> [[VABD2_I]]
    881 int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) {
    882   return vabd_s16(v1, v2);
    883 }
    884 
    885 // CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
    886 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
    887 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
    888 // CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    889 // CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    890 // CHECK:   [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4
    891 // CHECK:   ret <2 x i32> [[VABD2_I]]
    892 int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) {
    893   return vabd_s32(v1, v2);
    894 }
    895 
    896 // CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
    897 // CHECK:   [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
    898 // CHECK:   ret <8 x i8> [[VABD_I]]
    899 uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) {
    900   return vabd_u8(v1, v2);
    901 }
    902 
    903 // CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
    904 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
    905 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
    906 // CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
    907 // CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
    908 // CHECK:   [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4
    909 // CHECK:   ret <4 x i16> [[VABD2_I]]
    910 uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) {
    911   return vabd_u16(v1, v2);
    912 }
    913 
    914 // CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
    915 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
    916 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
    917 // CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    918 // CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    919 // CHECK:   [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4
    920 // CHECK:   ret <2 x i32> [[VABD2_I]]
    921 uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) {
    922   return vabd_u32(v1, v2);
    923 }
    924 
    925 // CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %v1, <2 x float> %v2) #0 {
    926 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
    927 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
    928 // CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
    929 // CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
    930 // CHECK:   [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]]) #4
    931 // CHECK:   ret <2 x float> [[VABD2_I]]
    932 float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
    933   return vabd_f32(v1, v2);
    934 }
    935 
    936 // CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
    937 // CHECK:   [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
    938 // CHECK:   ret <16 x i8> [[VABD_I]]
    939 int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) {
    940   return vabdq_s8(v1, v2);
    941 }
    942 
    943 // CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
    944 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
    945 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
    946 // CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
    947 // CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
    948 // CHECK:   [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4
    949 // CHECK:   ret <8 x i16> [[VABD2_I]]
    950 int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) {
    951   return vabdq_s16(v1, v2);
    952 }
    953 
    954 // CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
    955 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
    956 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
    957 // CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
    958 // CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
    959 // CHECK:   [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4
    960 // CHECK:   ret <4 x i32> [[VABD2_I]]
    961 int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) {
    962   return vabdq_s32(v1, v2);
    963 }
    964 
    965 // CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
    966 // CHECK:   [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
    967 // CHECK:   ret <16 x i8> [[VABD_I]]
    968 uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) {
    969   return vabdq_u8(v1, v2);
    970 }
    971 
    972 // CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
    973 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
    974 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
    975 // CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
    976 // CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
    977 // CHECK:   [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4
    978 // CHECK:   ret <8 x i16> [[VABD2_I]]
    979 uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) {
    980   return vabdq_u16(v1, v2);
    981 }
    982 
    983 // CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
    984 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
    985 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
    986 // CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
    987 // CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
    988 // CHECK:   [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4
    989 // CHECK:   ret <4 x i32> [[VABD2_I]]
    990 uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) {
    991   return vabdq_u32(v1, v2);
    992 }
    993 
    994 // CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
    995 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
    996 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
    997 // CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
    998 // CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
    999 // CHECK:   [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]]) #4
   1000 // CHECK:   ret <4 x float> [[VABD2_I]]
   1001 float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) {
   1002   return vabdq_f32(v1, v2);
   1003 }
   1004 
   1005 // CHECK-LABEL: define <2 x double> @test_vabdq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
   1006 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
   1007 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
   1008 // CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   1009 // CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1010 // CHECK:   [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[VABD_I]], <2 x double> [[VABD1_I]]) #4
   1011 // CHECK:   ret <2 x double> [[VABD2_I]]
   1012 float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) {
   1013   return vabdq_f64(v1, v2);
   1014 }
   1015 
   1016 
   1017 // CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
   1018 // CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
   1019 // CHECK:   [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
   1020 // CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
   1021 // CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
   1022 // CHECK:   ret <8 x i8> [[VBSL2_I]]
   1023 int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) {
   1024   return vbsl_s8(v1, v2, v3);
   1025 }
   1026 
   1027 // CHECK-LABEL: define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
   1028 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
   1029 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
   1030 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
   1031 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1032 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1033 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   1034 // CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
   1035 // CHECK:   [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
   1036 // CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
   1037 // CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
   1038 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
   1039 // CHECK:   ret <8 x i8> [[TMP4]]
   1040 int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) {
   1041   return vbsl_s16(v1, v2, v3);
   1042 }
   1043 
   1044 // CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
   1045 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
   1046 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
   1047 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
   1048 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1049 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1050 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   1051 // CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
   1052 // CHECK:   [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1>
   1053 // CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
   1054 // CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
   1055 // CHECK:   ret <2 x i32> [[VBSL5_I]]
   1056 int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) {
   1057   return vbsl_s32(v1, v2, v3);
   1058 }
   1059 
   1060 // CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 {
   1061 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
   1062 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
   1063 // CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
   1064 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   1065 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   1066 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
   1067 // CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
   1068 // CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
   1069 // CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
   1070 // CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
   1071 // CHECK:   ret <1 x i64> [[VBSL5_I]]
    1072 int64x1_t test_vbsl_s64(uint64x1_t v1, int64x1_t v2, int64x1_t v3) {
   1073   return vbsl_s64(v1, v2, v3);
   1074 }
   1075 
   1076 // CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
   1077 // CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
   1078 // CHECK:   [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
   1079 // CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
   1080 // CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
   1081 // CHECK:   ret <8 x i8> [[VBSL2_I]]
   1082 uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
   1083   return vbsl_u8(v1, v2, v3);
   1084 }
   1085 
   1086 // CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
   1087 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
   1088 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
   1089 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
   1090 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1091 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1092 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   1093 // CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
   1094 // CHECK:   [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
   1095 // CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
   1096 // CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
   1097 // CHECK:   ret <4 x i16> [[VBSL5_I]]
   1098 uint16x4_t test_vbsl_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
   1099   return vbsl_u16(v1, v2, v3);
   1100 }
   1101 
   1102 // CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
   1103 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
   1104 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
   1105 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
   1106 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1107 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1108 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   1109 // CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
   1110 // CHECK:   [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1>
   1111 // CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
   1112 // CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
   1113 // CHECK:   ret <2 x i32> [[VBSL5_I]]
   1114 uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
   1115   return vbsl_u32(v1, v2, v3);
   1116 }
   1117 
   1118 // CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 {
   1119 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
   1120 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
   1121 // CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
   1122 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   1123 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   1124 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
   1125 // CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
   1126 // CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
   1127 // CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
   1128 // CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
   1129 // CHECK:   ret <1 x i64> [[VBSL5_I]]
   1130 uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
   1131   return vbsl_u64(v1, v2, v3);
   1132 }
   1133 
    1134 // CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x i32> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
    1135 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
    1136 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
    1137 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
    1138 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
    1139 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
    1140 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
    1141 // CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
    1142 // CHECK:   [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1>
    1143 // CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
    1144 // CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
    1145 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
    1146 // CHECK:   ret <2 x float> [[TMP4]]
    1147 float32x2_t test_vbsl_f32(uint32x2_t v1, float32x2_t v2, float32x2_t v3) {
   1149   return vbsl_f32(v1, v2, v3);
   1150 }
   1151 
   1152 // CHECK-LABEL: define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) #0 {
   1153 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
   1154 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v2 to <8 x i8>
   1155 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v3 to <8 x i8>
   1156 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   1157 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   1158 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
   1159 // CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
   1160 // CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
   1161 // CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
   1162 // CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
   1163 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
   1164 // CHECK:   ret <1 x double> [[TMP4]]
   1165 float64x1_t test_vbsl_f64(uint64x1_t v1, float64x1_t v2, float64x1_t v3) {
   1166   return vbsl_f64(v1, v2, v3);
   1167 }
   1168 
   1169 // CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
   1170 // CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
   1171 // CHECK:   [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
   1172 // CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
   1173 // CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
   1174 // CHECK:   ret <8 x i8> [[VBSL2_I]]
   1175 poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) {
   1176   return vbsl_p8(v1, v2, v3);
   1177 }
   1178 
   1179 // CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
   1180 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
   1181 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
   1182 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
   1183 // CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1184 // CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1185 // CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   1186 // CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
   1187 // CHECK:   [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
   1188 // CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
   1189 // CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
   1190 // CHECK:   ret <4 x i16> [[VBSL5_I]]
   1191 poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) {
   1192   return vbsl_p16(v1, v2, v3);
   1193 }
   1194 
   1195 // CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
   1196 // CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
   1197 // CHECK:   [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
   1198 // CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
   1199 // CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
   1200 // CHECK:   ret <16 x i8> [[VBSL2_I]]
   1201 int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) {
   1202   return vbslq_s8(v1, v2, v3);
   1203 }
   1204 
   1205 // CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
   1206 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
   1207 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
   1208 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
   1209 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   1210 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   1211 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
   1212 // CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
   1213 // CHECK:   [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
   1214 // CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
   1215 // CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
   1216 // CHECK:   ret <8 x i16> [[VBSL5_I]]
   1217 int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) {
   1218   return vbslq_s16(v1, v2, v3);
   1219 }
   1220 
   1221 // CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
   1222 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
   1223 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
   1224 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
   1225 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1226 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   1227 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
   1228 // CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
   1229 // CHECK:   [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1>
   1230 // CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
   1231 // CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
   1232 // CHECK:   ret <4 x i32> [[VBSL5_I]]
   1233 int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
   1234   return vbslq_s32(v1, v2, v3);
   1235 }
   1236 
   1237 // CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 {
   1238 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
   1239 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
   1240 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
   1241 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   1242 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   1243 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
   1244 // CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
   1245 // CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
   1246 // CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
   1247 // CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
   1248 // CHECK:   ret <2 x i64> [[VBSL5_I]]
   1249 int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) {
   1250   return vbslq_s64(v1, v2, v3);
   1251 }
   1252 
   1253 // CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
   1254 // CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
   1255 // CHECK:   [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
   1256 // CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
   1257 // CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
   1258 // CHECK:   ret <16 x i8> [[VBSL2_I]]
   1259 uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
   1260   return vbslq_u8(v1, v2, v3);
   1261 }
   1262 
   1263 // CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
   1264 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
   1265 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
   1266 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
   1267 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   1268 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   1269 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
   1270 // CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
   1271 // CHECK:   [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
   1272 // CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
   1273 // CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
   1274 // CHECK:   ret <8 x i16> [[VBSL5_I]]
   1275 uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
   1276   return vbslq_u16(v1, v2, v3);
   1277 }
   1278 
   1279 // CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
   1280 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
   1281 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
   1282 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
   1283 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1284 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   1285 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
   1286 // CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
   1287 // CHECK:   [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1>
   1288 // CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
   1289 // CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
   1290 // CHECK:   ret <4 x i32> [[VBSL5_I]]
    1291 uint32x4_t test_vbslq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
    1292   return vbslq_u32(v1, v2, v3);
   1293 }
   1294 
   1295 // CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 {
   1296 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
   1297 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
   1298 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
   1299 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   1300 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   1301 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
   1302 // CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
   1303 // CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
   1304 // CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
   1305 // CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
   1306 // CHECK:   ret <2 x i64> [[VBSL5_I]]
   1307 uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) {
   1308   return vbslq_u64(v1, v2, v3);
   1309 }
   1310 
   1311 // CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
   1312 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
   1313 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
   1314 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
   1315 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1316 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   1317 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
   1318 // CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
   1319 // CHECK:   [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1>
   1320 // CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
   1321 // CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
   1322 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float>
   1323 // CHECK:   ret <4 x float> [[TMP4]]
   1324 float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) {
   1325   return vbslq_f32(v1, v2, v3);
   1326 }
   1327 
   1328 // CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
   1329 // CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
   1330 // CHECK:   [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
   1331 // CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
   1332 // CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
   1333 // CHECK:   ret <16 x i8> [[VBSL2_I]]
   1334 poly8x16_t test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) {
   1335   return vbslq_p8(v1, v2, v3);
   1336 }
   1337 
   1338 // CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
   1339 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
   1340 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
   1341 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
   1342 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   1343 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   1344 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
   1345 // CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
   1346 // CHECK:   [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
   1347 // CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
   1348 // CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
   1349 // CHECK:   ret <8 x i16> [[VBSL5_I]]
   1350 poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) {
   1351   return vbslq_p16(v1, v2, v3);
   1352 }
   1353 
   1354 // CHECK-LABEL: define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
   1355 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
   1356 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
   1357 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
   1358 // CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   1359 // CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   1360 // CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
   1361 // CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
   1362 // CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
   1363 // CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
   1364 // CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
   1365 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double>
   1366 // CHECK:   ret <2 x double> [[TMP4]]
   1367 float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) {
   1368   return vbslq_f64(v1, v2, v3);
   1369 }
   1370 
   1371 // CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %v1, <2 x float> %v2) #0 {
   1372 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
   1373 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
   1374 // CHECK:   [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1375 // CHECK:   [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1376 // CHECK:   [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4
   1377 // CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
   1378 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float>
   1379 // CHECK:   ret <2 x float> [[TMP2]]
   1380 float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) {
   1381    return vrecps_f32(v1, v2);
   1382 }
   1383 
   1384 // CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
   1385 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
   1386 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
   1387 // CHECK:   [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1388 // CHECK:   [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1389 // CHECK:   [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4
   1390 // CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
   1391 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float>
   1392 // CHECK:   ret <4 x float> [[TMP2]]
   1393 float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) {
   1394    return vrecpsq_f32(v1, v2);
   1395 }
   1396 
   1397 // CHECK-LABEL: define <2 x double> @test_vrecpsq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
   1398 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
   1399 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
   1400 // CHECK:   [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   1401 // CHECK:   [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1402 // CHECK:   [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> [[VRECPSQ_V_I]], <2 x double> [[VRECPSQ_V1_I]]) #4
   1403 // CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8>
   1404 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <2 x double>
   1405 // CHECK:   ret <2 x double> [[TMP2]]
   1406 float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) {
   1407   return vrecpsq_f64(v1, v2);
   1408 }
   1409 
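// vrecps/vrecpsq map to FRECPS, the Newton-Raphson step for a reciprocal:
// it returns 2.0 - a*b, so x' = x * frecps(a, x) refines an estimate x of
// 1/a. Scalar sketch, illustrative only (helper name is ours):
static inline float frecps_ref(float a, float x) {
  return 2.0f - a * x;  // multiply by x to get the refined estimate
}
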
   1410 // CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %v1, <2 x float> %v2) #0 {
   1411 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
   1412 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
   1413 // CHECK:   [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1414 // CHECK:   [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1415 // CHECK:   [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4
   1416 // CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
   1417 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float>
   1418 // CHECK:   ret <2 x float> [[TMP2]]
   1419 float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) {
   1420   return vrsqrts_f32(v1, v2);
   1421 }
   1422 
   1423 // CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
   1424 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
   1425 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
   1426 // CHECK:   [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1427 // CHECK:   [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1428 // CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4
   1429 // CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
   1430 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float>
   1431 // CHECK:   ret <4 x float> [[TMP2]]
   1432 float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) {
   1433   return vrsqrtsq_f32(v1, v2);
   1434 }
   1435 
   1436 // CHECK-LABEL: define <2 x double> @test_vrsqrtsq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
   1437 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
   1438 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
   1439 // CHECK:   [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   1440 // CHECK:   [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1441 // CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> [[VRSQRTSQ_V_I]], <2 x double> [[VRSQRTSQ_V1_I]]) #4
   1442 // CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8>
   1443 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <2 x double>
   1444 // CHECK:   ret <2 x double> [[TMP2]]
   1445 float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) {
   1446   return vrsqrtsq_f64(v1, v2);
   1447 }
   1448 
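// vrsqrts/vrsqrtsq map to FRSQRTS, the Newton-Raphson step for a reciprocal
// square root: it returns (3.0 - a*b) / 2.0, so x' = x * frsqrts(a*x, x)
// refines an estimate x of 1/sqrt(a). Scalar sketch, illustrative only
// (helper name is ours):
static inline float frsqrts_ref(float a, float b) {
  return (3.0f - a * b) / 2.0f;
}
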
   1449 // CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %v1, <2 x float> %v2) #0 {
   1450 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
   1451 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
   1452 // CHECK:   [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1453 // CHECK:   [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1454 // CHECK:   [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4
   1455 // CHECK:   ret <2 x i32> [[VCAGE_V2_I]]
   1456 uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) {
   1457   return vcage_f32(v1, v2);
   1458 }
   1459 
   1460 // CHECK-LABEL: define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 {
   1461 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
   1462 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
   1463 // CHECK:   [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
   1464 // CHECK:   [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
   1465 // CHECK:   [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCAGE_V_I]], <1 x double> [[VCAGE_V1_I]]) #4
   1466 // CHECK:   ret <1 x i64> [[VCAGE_V2_I]]
   1467 uint64x1_t test_vcage_f64(float64x1_t a, float64x1_t b) {
   1468   return vcage_f64(a, b);
   1469 }
   1470 
   1471 // CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
   1472 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
   1473 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
   1474 // CHECK:   [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1475 // CHECK:   [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1476 // CHECK:   [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4
   1477 // CHECK:   ret <4 x i32> [[VCAGEQ_V2_I]]
   1478 uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) {
   1479   return vcageq_f32(v1, v2);
   1480 }
   1481 
   1482 // CHECK-LABEL: define <2 x i64> @test_vcageq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
   1483 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
   1484 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
   1485 // CHECK:   [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   1486 // CHECK:   [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1487 // CHECK:   [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCAGEQ_V_I]], <2 x double> [[VCAGEQ_V1_I]]) #4
   1488 // CHECK:   ret <2 x i64> [[VCAGEQ_V2_I]]
   1489 uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) {
   1490   return vcageq_f64(v1, v2);
   1491 }
   1492 
   1493 // CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %v1, <2 x float> %v2) #0 {
   1494 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
   1495 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
   1496 // CHECK:   [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1497 // CHECK:   [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1498 // CHECK:   [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4
   1499 // CHECK:   ret <2 x i32> [[VCAGT_V2_I]]
   1500 uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) {
   1501   return vcagt_f32(v1, v2);
   1502 }
   1503 
   1504 // CHECK-LABEL: define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 {
   1505 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
   1506 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
   1507 // CHECK:   [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
   1508 // CHECK:   [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
   1509 // CHECK:   [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCAGT_V_I]], <1 x double> [[VCAGT_V1_I]]) #4
   1510 // CHECK:   ret <1 x i64> [[VCAGT_V2_I]]
   1511 uint64x1_t test_vcagt_f64(float64x1_t a, float64x1_t b) {
   1512   return vcagt_f64(a, b);
   1513 }
   1514 
   1515 // CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
   1516 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
   1517 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
   1518 // CHECK:   [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1519 // CHECK:   [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1520 // CHECK:   [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4
   1521 // CHECK:   ret <4 x i32> [[VCAGTQ_V2_I]]
   1522 uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) {
   1523   return vcagtq_f32(v1, v2);
   1524 }
   1525 
   1526 // CHECK-LABEL: define <2 x i64> @test_vcagtq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
   1527 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
   1528 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
   1529 // CHECK:   [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   1530 // CHECK:   [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1531 // CHECK:   [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCAGTQ_V_I]], <2 x double> [[VCAGTQ_V1_I]]) #4
   1532 // CHECK:   ret <2 x i64> [[VCAGTQ_V2_I]]
   1533 uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) {
   1534   return vcagtq_f64(v1, v2);
   1535 }
   1536 
   1537 // CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %v1, <2 x float> %v2) #0 {
   1538 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
   1539 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
   1540 // CHECK:   [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1541 // CHECK:   [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1542 // CHECK:   [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4
   1543 // CHECK:   ret <2 x i32> [[VCALE_V2_I]]
   1544 uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) {
   1545   return vcale_f32(v1, v2);
    1546   // Using registers other than v0 and v1 is possible, but would be odd.
   1547 }
   1548 
   1549 // CHECK-LABEL: define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 {
   1550 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
   1551 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
   1552 // CHECK:   [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
   1553 // CHECK:   [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
   1554 // CHECK:   [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCALE_V_I]], <1 x double> [[VCALE_V1_I]]) #4
   1555 // CHECK:   ret <1 x i64> [[VCALE_V2_I]]
   1556 uint64x1_t test_vcale_f64(float64x1_t a, float64x1_t b) {
   1557   return vcale_f64(a, b);
   1558 }
   1559 
   1560 // CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
   1561 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
   1562 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
   1563 // CHECK:   [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1564 // CHECK:   [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1565 // CHECK:   [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4
   1566 // CHECK:   ret <4 x i32> [[VCALEQ_V2_I]]
   1567 uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) {
   1568   return vcaleq_f32(v1, v2);
    1569   // Using registers other than v0 and v1 is possible, but would be odd.
   1570 }
   1571 
   1572 // CHECK-LABEL: define <2 x i64> @test_vcaleq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
   1573 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
   1574 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
   1575 // CHECK:   [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1576 // CHECK:   [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   1577 // CHECK:   [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCALEQ_V_I]], <2 x double> [[VCALEQ_V1_I]]) #4
   1578 // CHECK:   ret <2 x i64> [[VCALEQ_V2_I]]
   1579 uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) {
   1580   return vcaleq_f64(v1, v2);
    1581   // Using registers other than v0 and v1 is possible, but would be odd.
   1582 }
   1583 
   1584 // CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %v1, <2 x float> %v2) #0 {
   1585 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
   1586 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
   1587 // CHECK:   [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   1588 // CHECK:   [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   1589 // CHECK:   [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4
   1590 // CHECK:   ret <2 x i32> [[VCALT_V2_I]]
   1591 uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) {
   1592   return vcalt_f32(v1, v2);
    1593   // Using registers other than v0 and v1 is possible, but would be odd.
   1594 }
   1595 
   1596 // CHECK-LABEL: define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 {
   1597 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
   1598 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
   1599 // CHECK:   [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
   1600 // CHECK:   [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
   1601 // CHECK:   [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCALT_V_I]], <1 x double> [[VCALT_V1_I]]) #4
   1602 // CHECK:   ret <1 x i64> [[VCALT_V2_I]]
   1603 uint64x1_t test_vcalt_f64(float64x1_t a, float64x1_t b) {
   1604   return vcalt_f64(a, b);
   1605 }
   1606 
   1607 // CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
   1608 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
   1609 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
   1610 // CHECK:   [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   1611 // CHECK:   [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   1612 // CHECK:   [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4
   1613 // CHECK:   ret <4 x i32> [[VCALTQ_V2_I]]
   1614 uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) {
   1615   return vcaltq_f32(v1, v2);
    1616   // Using registers other than v0 and v1 is possible, but would be odd.
   1617 }
   1618 
   1619 // CHECK-LABEL: define <2 x i64> @test_vcaltq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
   1620 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
   1621 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
   1622 // CHECK:   [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   1623 // CHECK:   [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   1624 // CHECK:   [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCALTQ_V_I]], <2 x double> [[VCALTQ_V1_I]]) #4
   1625 // CHECK:   ret <2 x i64> [[VCALTQ_V2_I]]
   1626 uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) {
   1627   return vcaltq_f64(v1, v2);
    1628   // Using registers other than v0 and v1 is possible, but would be odd.
   1629 }
   1630 
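// vcage/vcagt compare absolute values (facge: |v1| >= |v2|, facgt:
// |v1| > |v2|) and return an all-ones lane on success. There are no
// separate "absolute less" intrinsics: as the vcale/vcalt CHECK lines
// above show, clang simply swaps the operands of facge/facgt. Scalar
// sketch, illustrative only (__builtin_fabsf avoids adding a <math.h>
// include to this test):
static inline uint32_t cage_ref(float a, float b) {
  return __builtin_fabsf(a) >= __builtin_fabsf(b) ? 0xffffffffU : 0U;
}
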
   1631 // CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
   1632 // CHECK:   [[TMP0:%.*]] = and <8 x i8> %v1, %v2
   1633 // CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
   1634 // CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
   1635 // CHECK:   ret <8 x i8> [[VTST_I]]
   1636 uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) {
   1637   return vtst_s8(v1, v2);
   1638 }
   1639 
   1640 // CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
   1641 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
   1642 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
   1643 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1644 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1645 // CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
   1646 // CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
   1647 // CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
   1648 // CHECK:   ret <4 x i16> [[VTST_I]]
   1649 uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) {
   1650   return vtst_s16(v1, v2);
   1651 }
   1652 
   1653 // CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
   1654 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
   1655 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
   1656 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1657 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1658 // CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
   1659 // CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
   1660 // CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
   1661 // CHECK:   ret <2 x i32> [[VTST_I]]
   1662 uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) {
   1663   return vtst_s32(v1, v2);
   1664 }
   1665 
   1666 // CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
   1667 // CHECK:   [[TMP0:%.*]] = and <8 x i8> %v1, %v2
   1668 // CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
   1669 // CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
   1670 // CHECK:   ret <8 x i8> [[VTST_I]]
   1671 uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) {
   1672   return vtst_u8(v1, v2);
   1673 }
   1674 
   1675 // CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
   1676 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
   1677 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
   1678 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1679 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1680 // CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
   1681 // CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
   1682 // CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
   1683 // CHECK:   ret <4 x i16> [[VTST_I]]
   1684 uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) {
   1685   return vtst_u16(v1, v2);
   1686 }
   1687 
   1688 // CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
   1689 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
   1690 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
   1691 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   1692 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   1693 // CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
   1694 // CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
   1695 // CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
   1696 // CHECK:   ret <2 x i32> [[VTST_I]]
   1697 uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) {
   1698   return vtst_u32(v1, v2);
   1699 }
   1700 
   1701 // CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
   1702 // CHECK:   [[TMP0:%.*]] = and <16 x i8> %v1, %v2
   1703 // CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
   1704 // CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
   1705 // CHECK:   ret <16 x i8> [[VTST_I]]
   1706 uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) {
   1707   return vtstq_s8(v1, v2);
   1708 }
   1709 
   1710 // CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
   1711 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
   1712 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
   1713 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   1714 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   1715 // CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
   1716 // CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
   1717 // CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
   1718 // CHECK:   ret <8 x i16> [[VTST_I]]
   1719 uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) {
   1720   return vtstq_s16(v1, v2);
   1721 }
   1722 
   1723 // CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
   1724 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
   1725 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
   1726 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1727 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   1728 // CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
   1729 // CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
   1730 // CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
   1731 // CHECK:   ret <4 x i32> [[VTST_I]]
   1732 uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) {
   1733   return vtstq_s32(v1, v2);
   1734 }
   1735 
   1736 // CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
   1737 // CHECK:   [[TMP0:%.*]] = and <16 x i8> %v1, %v2
   1738 // CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
   1739 // CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
   1740 // CHECK:   ret <16 x i8> [[VTST_I]]
   1741 uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) {
   1742   return vtstq_u8(v1, v2);
   1743 }
   1744 
   1745 // CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
   1746 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
   1747 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
   1748 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   1749 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   1750 // CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
   1751 // CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
   1752 // CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
   1753 // CHECK:   ret <8 x i16> [[VTST_I]]
   1754 uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) {
   1755   return vtstq_u16(v1, v2);
   1756 }
   1757 
   1758 // CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
   1759 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
   1760 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
   1761 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   1762 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   1763 // CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
   1764 // CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
   1765 // CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
   1766 // CHECK:   ret <4 x i32> [[VTST_I]]
   1767 uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) {
   1768   return vtstq_u32(v1, v2);
   1769 }
   1770 
   1771 // CHECK-LABEL: define <2 x i64> @test_vtstq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
   1772 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
   1773 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
   1774 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   1775 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   1776 // CHECK:   [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]]
   1777 // CHECK:   [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
   1778 // CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
   1779 // CHECK:   ret <2 x i64> [[VTST_I]]
   1780 uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) {
   1781   return vtstq_s64(v1, v2);
   1782 }
   1783 
   1784 // CHECK-LABEL: define <2 x i64> @test_vtstq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
   1785 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
   1786 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
   1787 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   1788 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   1789 // CHECK:   [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]]
   1790 // CHECK:   [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
   1791 // CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
   1792 // CHECK:   ret <2 x i64> [[VTST_I]]
   1793 uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) {
   1794   return vtstq_u64(v1, v2);
   1795 }
   1796 
   1797 // CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %v1, <8 x i8> %v2) #0 {
   1798 // CHECK:   [[TMP0:%.*]] = and <8 x i8> %v1, %v2
   1799 // CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
   1800 // CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
   1801 // CHECK:   ret <8 x i8> [[VTST_I]]
   1802 uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) {
   1803   return vtst_p8(v1, v2);
   1804 }
   1805 
   1806 // CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %v1, <4 x i16> %v2) #0 {
   1807 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
   1808 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
   1809 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   1810 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   1811 // CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
   1812 // CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
   1813 // CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
   1814 // CHECK:   ret <4 x i16> [[VTST_I]]
   1815 uint16x4_t test_vtst_p16(poly16x4_t v1, poly16x4_t v2) {
   1816   return vtst_p16(v1, v2);
   1817 }
   1818 
   1819 // CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 {
   1820 // CHECK:   [[TMP0:%.*]] = and <16 x i8> %v1, %v2
   1821 // CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
   1822 // CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
   1823 // CHECK:   ret <16 x i8> [[VTST_I]]
   1824 uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) {
   1825   return vtstq_p8(v1, v2);
   1826 }
   1827 
   1828 // CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %v1, <8 x i16> %v2) #0 {
   1829 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
   1830 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
   1831 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   1832 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   1833 // CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
   1834 // CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
   1835 // CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
   1836 // CHECK:   ret <8 x i16> [[VTST_I]]
   1837 uint16x8_t test_vtstq_p16(poly16x8_t v1, poly16x8_t v2) {
   1838   return vtstq_p16(v1, v2);
   1839 }
   1840 
   1841 // CHECK-LABEL: define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) #0 {
   1842 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   1843 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   1844 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   1845 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   1846 // CHECK:   [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]]
   1847 // CHECK:   [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
   1848 // CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
   1849 // CHECK:   ret <1 x i64> [[VTST_I]]
   1850 uint64x1_t test_vtst_s64(int64x1_t a, int64x1_t b) {
   1851   return vtst_s64(a, b);
   1852 }
   1853 
   1854 // CHECK-LABEL: define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) #0 {
   1855 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   1856 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   1857 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   1858 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   1859 // CHECK:   [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]]
   1860 // CHECK:   [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
   1861 // CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
   1862 // CHECK:   ret <1 x i64> [[VTST_I]]
   1863 uint64x1_t test_vtst_u64(uint64x1_t a, uint64x1_t b) {
   1864   return vtst_u64(a, b);
   1865 }
   1866 
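// vtst needs no target intrinsic at the IR level: the CHECK lines above
// show it as and + icmp ne 0 + sext, i.e. a lane becomes all-ones when
// v1 and v2 share at least one set bit. Scalar sketch, illustrative only:
static inline uint8_t tst_ref(uint8_t a, uint8_t b) {
  return (a & b) != 0 ? 0xff : 0x00;
}
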
   1867 // CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
   1868 // CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
   1869 // CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
   1870 // CHECK:   ret <8 x i8> [[SEXT_I]]
   1871 uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) {
   1872   return vceq_s8(v1, v2);
   1873 }
   1874 
   1875 // CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
   1876 // CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2
   1877 // CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
   1878 // CHECK:   ret <4 x i16> [[SEXT_I]]
   1879 uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) {
   1880   return vceq_s16(v1, v2);
   1881 }
   1882 
   1883 // CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
   1884 // CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2
   1885 // CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
   1886 // CHECK:   ret <2 x i32> [[SEXT_I]]
   1887 uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) {
   1888   return vceq_s32(v1, v2);
   1889 }
   1890 
   1891 // CHECK-LABEL: define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 {
   1892 // CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
   1893 // CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
   1894 // CHECK:   ret <1 x i64> [[SEXT_I]]
   1895 uint64x1_t test_vceq_s64(int64x1_t a, int64x1_t b) {
   1896   return vceq_s64(a, b);
   1897 }
   1898 
// CHECK-LABEL: define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vceq_u64(uint64x1_t a, uint64x1_t b) {
  return vceq_u64(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %v1, <2 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) {
  return vceq_f32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) {
  return vceq_f64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) {
  return vceq_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t v2) {
  return vceq_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t v2) {
  return vceq_u32(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t v2) {
  return vceq_p8(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) {
  return vceqq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) {
  return vceqq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) {
  return vceqq_s32(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) {
  return vceqq_f32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vceqq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vceqq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vceqq_u32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) {
  return vceqq_p8(v1, v2);
}


// CHECK-LABEL: define <2 x i64> @test_vceqq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) {
  return vceqq_s64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vceqq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vceqq_u64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vceqq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) {
  return vceqq_f64(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) {
  return vcge_s8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) {
  return vcge_s16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) {
  return vcge_s32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vcge_s64(int64x1_t a, int64x1_t b) {
  return vcge_s64(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vcge_u64(uint64x1_t a, uint64x1_t b) {
  return vcge_u64(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %v1, <2 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) {
  return vcge_f32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) {
  return vcge_f64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) {
  return vcge_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) {
  return vcge_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) {
  return vcge_u32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) {
  return vcgeq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) {
  return vcgeq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) {
  return vcgeq_s32(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) {
  return vcgeq_f32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vcgeq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vcgeq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vcgeq_u32(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcgeq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) {
  return vcgeq_s64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcgeq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vcgeq_u64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcgeq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
  return vcgeq_f64(v1, v2);
}

// Notes about vcle:
// The LE condition predicate is implemented as GE, so check for reversed
// operands. Using registers other than v0, v1 is possible, but would be odd.
// (A usage sketch follows test_vcle_s8 below.)
// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) {
  return vcle_s8(v1, v2);
}
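
// A minimal usage sketch, not part of the FileCheck test (the helper name is
// hypothetical, and the unused static inline emits no IR): the comparison
// intrinsics return an all-ones lane mask (0xFF) where the predicate holds
// and all-zeros where it does not, which is why the IR sign-extends the
// <8 x i1> compare result back to the lane width.
static inline int example_vcle_mask_lane0(void) {
  int8x8_t lhs = vdup_n_s8(1);           // every lane holds 1
  int8x8_t rhs = vdup_n_s8(2);           // every lane holds 2
  uint8x8_t mask = vcle_s8(lhs, rhs);    // 1 <= 2, so every lane is 0xFF
  return vget_lane_u8(mask, 0) == 0xFF;  // expected to return 1
}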

// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) {
  return vcle_s16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) {
  return vcle_s32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vcle_s64(int64x1_t a, int64x1_t b) {
  return vcle_s64(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vcle_u64(uint64x1_t a, uint64x1_t b) {
  return vcle_u64(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %v1, <2 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) {
  return vcle_f32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) {
  return vcle_f64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) {
  return vcle_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) {
  return vcle_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) {
  return vcle_u32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) {
  return vcleq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) {
  return vcleq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) {
  return vcleq_s32(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) {
  return vcleq_f32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vcleq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vcleq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vcleq_u32(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcleq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcleq_s64(int64x2_t v1, int64x2_t v2) {
  return vcleq_s64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcleq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vcleq_u64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcleq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) {
  return vcleq_f64(v1, v2);
}


// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) {
  return vcgt_s8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) {
  return vcgt_s16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) {
  return vcgt_s32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vcgt_s64(int64x1_t a, int64x1_t b) {
  return vcgt_s64(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vcgt_u64(uint64x1_t a, uint64x1_t b) {
  return vcgt_u64(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %v1, <2 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) {
  return vcgt_f32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) {
  return vcgt_f64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) {
  return vcgt_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) {
  return vcgt_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) {
  return vcgt_u32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) {
  return vcgtq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) {
  return vcgtq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) {
  return vcgtq_s32(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) {
  return vcgtq_f32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vcgtq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vcgtq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vcgtq_u32(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcgtq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) {
  return vcgtq_s64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcgtq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vcgtq_u64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcgtq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
  return vcgtq_f64(v1, v2);
}


// Notes about vclt:
// The LT condition predicate is implemented as GT, so check for reversed
// operands. Using registers other than v0, v1 is possible, but would be odd.
// (A usage sketch follows test_vclt_s8 below.)

// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) {
  return vclt_s8(v1, v2);
}
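
// A small sketch of the equivalence the note above relies on, not part of the
// FileCheck test (the helper name is hypothetical): at the instruction level
// LT is GT with the operands swapped, so vclt(a, b) and vcgt(b, a) produce
// identical masks.
static inline int example_vclt_is_swapped_vcgt(void) {
  int8x8_t a = vdup_n_s8(-3);
  int8x8_t b = vdup_n_s8(7);
  uint8x8_t lt = vclt_s8(a, b);  // -3 < 7 in every lane
  uint8x8_t gt = vcgt_s8(b, a);  // same comparison, operands swapped
  return vget_lane_u8(lt, 0) == vget_lane_u8(gt, 0);  // expected to return 1
}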

// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) {
  return vclt_s16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) {
  return vclt_s32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vclt_s64(int64x1_t a, int64x1_t b) {
  return vclt_s64(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <1 x i64> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vclt_u64(uint64x1_t a, uint64x1_t b) {
  return vclt_u64(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %v1, <2 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) {
  return vclt_f32(v1, v2);
}

// CHECK-LABEL: define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[SEXT_I]]
uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) {
  return vclt_f64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) {
  return vclt_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) {
  return vclt_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) {
  return vclt_u32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) {
  return vcltq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) {
  return vcltq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) {
  return vcltq_s32(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) {
  return vcltq_f32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <16 x i8> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vcltq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i16> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vcltq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i32> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vcltq_u32(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcltq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcltq_s64(int64x2_t v1, int64x2_t v2) {
  return vcltq_s64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcltq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i64> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vcltq_u64(v1, v2);
}

// CHECK-LABEL: define <2 x i64> @test_vcltq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[SEXT_I]]
uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) {
  return vcltq_f64(v1, v2);
}


// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
// CHECK:   ret <8 x i8> [[VHADD_V_I]]
int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) {
  return vhadd_s8(v1, v2);
}
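
// A hedged illustration, not part of the FileCheck test (the helper name is
// hypothetical): the halving-add intrinsics lowered to shadd/uhadd compute
// (a + b) >> 1 in a widened intermediate, so the sum cannot wrap around the
// lane type.
static inline int example_vhadd_no_overflow(void) {
  int8x8_t a = vdup_n_s8(100);
  int8x8_t b = vdup_n_s8(100);
  int8x8_t h = vhadd_s8(a, b);  // (100 + 100) >> 1 = 100; no int8 wraparound
  return vget_lane_s8(h, 0) == 100;  // expected to return 1
}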

// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) {
  return vhadd_s16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) {
  return vhadd_s32(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
// CHECK:   ret <8 x i8> [[VHADD_V_I]]
uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) {
  return vhadd_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) {
  return vhadd_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) {
  return vhadd_u32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) {
  return vhaddq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) {
  return vhaddq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) {
  return vhaddq_s32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vhaddq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vhaddq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vhaddq_u32(v1, v2);
}


// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) {
  return vhsub_s8(v1, v2);
}
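
// A hedged illustration, not part of the FileCheck test (the helper name is
// hypothetical): the halving-subtract intrinsics compute (a - b) >> 1 with an
// arithmetic shift, rounding toward negative infinity.
static inline int example_vhsub_rounding(void) {
  int8x8_t a = vdup_n_s8(1);
  int8x8_t b = vdup_n_s8(4);
  int8x8_t h = vhsub_s8(a, b);  // (1 - 4) >> 1 = -3 >> 1 = -2
  return vget_lane_s8(h, 0) == -2;  // expected to return 1
}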

// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) {
  return vhsub_s16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) {
  return vhsub_s32(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) {
  return vhsub_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) {
  return vhsub_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) {
  return vhsub_u32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) {
  return vhsubq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) {
  return vhsubq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) {
  return vhsubq_s32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vhsubq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vhsubq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vhsubq_u32(v1, v2);
}


// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
  return vrhadd_s8(v1, v2);
}
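
// A hedged illustration, not part of the FileCheck test (the helper name is
// hypothetical): the rounding variant adds 1 before the halving shift, i.e.
// (a + b + 1) >> 1, so odd sums round up instead of down.
static inline int example_vrhadd_rounds_up(void) {
  int8x8_t a = vdup_n_s8(1);
  int8x8_t b = vdup_n_s8(2);
  int8x8_t r = vrhadd_s8(a, b);  // (1 + 2 + 1) >> 1 = 2; vhadd would give 1
  return vget_lane_s8(r, 0) == 2;  // expected to return 1
}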

// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
  return vrhadd_s16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
  return vrhadd_s32(v1, v2);
}

// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
  return vrhadd_u8(v1, v2);
}

// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
  return vrhadd_u16(v1, v2);
}

// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
  return vrhadd_u32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
  return vrhaddq_s8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
  return vrhaddq_s16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
  return vrhaddq_s32(v1, v2);
}

// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vrhaddq_u8(v1, v2);
}

// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vrhaddq_u16(v1, v2);
}

// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
   3085 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
   3086 // CHECK:   ret <4 x i32> [[TMP2]]
   3087 uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
   3088   return vrhaddq_u32(v1, v2);
   3089 }
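
// Note: vqadd is a saturating add; lanes that would overflow clamp to the
// type's limits instead of wrapping. Illustrative (not part of the checked
// IR): vqadd_s8 on lanes holding 127 and 1 yields 127 (INT8_MAX), where a
// plain vadd_s8 would wrap to -128.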
// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQADD_V_I]]
int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
  return vqadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
  return vqadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
  return vqadd_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
  return vqadd_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQADD_V_I]]
uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
  return vqadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
  return vqadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
  return vqadd_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
  return vqadd_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
  return vqaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
  return vqaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
  return vqaddq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
  return vqaddq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vqaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vqaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vqaddq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vqaddq_u64(a, b);
}

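// Note: vqsub is a saturating subtract. Illustrative (not part of the
// checked IR): vqsub_u8 on lanes holding 0 and 1 yields 0 rather than
// wrapping to 255.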
// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
  return vqsub_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
  return vqsub_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
  return vqsub_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
  return vqsub_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
  return vqsub_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
  return vqsub_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
  return vqsub_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
  return vqsub_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
  return vqsubq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
  return vqsubq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
  return vqsubq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
  return vqsubq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vqsubq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vqsubq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vqsubq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
  return vqsubq_u64(a, b);
}

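// Note: vshl shifts each lane of a by the signed per-lane count in b, with
// negative counts shifting right; this is why the unsigned variants below
// still take a signed vector for b.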
// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VSHL_V_I]]
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
  return vshl_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
  return vshl_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
  return vshl_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
  return vshl_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VSHL_V_I]]
uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
  return vshl_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
  return vshl_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
  return vshl_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
  return vshl_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
  return vshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
  return vshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
  return vshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
  return vshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
  return vshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
  return vshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
  return vshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
  return vshlq_u64(a, b);
}

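// Note: vqshl is the saturating form of vshl; left shifts whose result
// would not fit clamp to the type's limits instead of discarding high bits.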
// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
  return vqshl_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
  return vqshl_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
  return vqshl_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
  return vqshl_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
  return vqshl_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
  return vqshl_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
  return vqshl_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
  return vqshl_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
  return vqshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
  return vqshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
  return vqshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
  return vqshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
  return vqshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
  return vqshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
  return vqshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
  return vqshlq_u64(a, b);
}

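// Note: vrshl is the rounding form of vshl; right shifts (negative counts)
// round to nearest instead of truncating.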
   3829 // CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   3830 // CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   3831 // CHECK:   ret <8 x i8> [[VRSHL_V_I]]
   3832 int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
   3833   return vrshl_s8(a, b);
   3834 }
   3835 
   3836 // CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   3837 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   3838 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   3839 // CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   3840 // CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   3841 // CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
   3842 // CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
   3843 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
   3844 // CHECK:   ret <4 x i16> [[TMP2]]
   3845 int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
   3846   return vrshl_s16(a, b);
   3847 }
   3848 
   3849 // CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   3850 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   3851 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   3852 // CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   3853 // CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   3854 // CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
   3855 // CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
   3856 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
   3857 // CHECK:   ret <2 x i32> [[TMP2]]
   3858 int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
   3859   return vrshl_s32(a, b);
   3860 }
   3861 
   3862 // CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
   3863 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   3864 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   3865 // CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   3866 // CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   3867 // CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
   3868 // CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
   3869 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
   3870 // CHECK:   ret <1 x i64> [[TMP2]]
   3871 int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
   3872   return vrshl_s64(a, b);
   3873 }
   3874 
// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
  return vrshl_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
  return vrshl_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
  return vrshl_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
  return vrshl_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
  return vrshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
  return vrshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
  return vrshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
  return vrshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
  return vrshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
  return vrshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
  return vrshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
  return vrshlq_u64(a, b);
}

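// vqrshl_*/vqrshlq_*: like vrshl above, but the rounded result additionally
// saturates to the element type's range instead of wrapping. Illustrative
// values (not matched by FileCheck): vqrshl_s8 on a lane holding 127 with a
// shift count of 1 stays at 127, where a non-saturating shift would wrap
// to -2.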
// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
  return vqrshl_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
  return vqrshl_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
  return vqrshl_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
  return vqrshl_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
  return vqrshl_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
  return vqrshl_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
  return vqrshl_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
  return vqrshl_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
  return vqrshlq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
  return vqrshlq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
  return vqrshlq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
  return vqrshlq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
  return vqrshlq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
  return vqrshlq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
  return vqrshlq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
  return vqrshlq_u64(a, b);
}

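// vsli_n_* (shift left and insert): each lane of the second operand is
// shifted left by the immediate and inserted into the first operand, with
// the low bits of the destination preserved. With an immediate of 0, as in
// the two tests below, every bit of the result comes from the second operand.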
// CHECK-LABEL: define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0)
// CHECK:   ret <1 x i64> [[VSLI_N2]]
poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) {
  return vsli_n_p64(a, b, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
poly64x2_t test_vsliq_n_p64(poly64x2_t a, poly64x2_t b) {
  return vsliq_n_p64(a, b, 0);
}

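// vmax_*/vmaxq_* (and the vmin_* family further down) compute a per-lane
// maximum (respectively minimum). The floating-point variants lower to
// fmax/fmin, which propagate NaNs, in contrast to the maxNum semantics of
// the vmaxnm family tested later in this file.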
// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMAX_I]]
int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
  return vmax_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4
// CHECK:   ret <4 x i16> [[VMAX2_I]]
int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
  return vmax_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4
// CHECK:   ret <2 x i32> [[VMAX2_I]]
int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
  return vmax_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMAX_I]]
uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
  return vmax_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4
// CHECK:   ret <4 x i16> [[VMAX2_I]]
uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
  return vmax_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4
// CHECK:   ret <2 x i32> [[VMAX2_I]]
uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
  return vmax_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[VMAX_I]], <2 x float> [[VMAX1_I]]) #4
// CHECK:   ret <2 x float> [[VMAX2_I]]
float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
  return vmax_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMAX_I]]
int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
  return vmaxq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4
// CHECK:   ret <8 x i16> [[VMAX2_I]]
int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
  return vmaxq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4
// CHECK:   ret <4 x i32> [[VMAX2_I]]
int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
  return vmaxq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMAX_I]]
uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
  return vmaxq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4
// CHECK:   ret <8 x i16> [[VMAX2_I]]
uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
  return vmaxq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4
// CHECK:   ret <4 x i32> [[VMAX2_I]]
uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
  return vmaxq_u32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[VMAX_I]], <4 x float> [[VMAX1_I]]) #4
// CHECK:   ret <4 x float> [[VMAX2_I]]
float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
  return vmaxq_f32(a, b);
}

// CHECK-LABEL: define <2 x double> @test_vmaxq_f64(<2 x double> %a, <2 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK:   [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> [[VMAX_I]], <2 x double> [[VMAX1_I]]) #4
// CHECK:   ret <2 x double> [[VMAX2_I]]
float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) {
  return vmaxq_f64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMIN_I]]
int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
  return vmin_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4
// CHECK:   ret <4 x i16> [[VMIN2_I]]
int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
  return vmin_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4
// CHECK:   ret <2 x i32> [[VMIN2_I]]
int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
  return vmin_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMIN_I]]
uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
  return vmin_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4
// CHECK:   ret <4 x i16> [[VMIN2_I]]
uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
  return vmin_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4
// CHECK:   ret <2 x i32> [[VMIN2_I]]
uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
  return vmin_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[VMIN_I]], <2 x float> [[VMIN1_I]]) #4
// CHECK:   ret <2 x float> [[VMIN2_I]]
float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
  return vmin_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMIN_I]]
int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
  return vminq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4
// CHECK:   ret <8 x i16> [[VMIN2_I]]
int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
  return vminq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4
// CHECK:   ret <4 x i32> [[VMIN2_I]]
int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
  return vminq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMIN_I]]
uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
  return vminq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4
// CHECK:   ret <8 x i16> [[VMIN2_I]]
uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
  return vminq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4
// CHECK:   ret <4 x i32> [[VMIN2_I]]
uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
  return vminq_u32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[VMIN_I]], <4 x float> [[VMIN1_I]]) #4
// CHECK:   ret <4 x float> [[VMIN2_I]]
float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
  return vminq_f32(a, b);
}

// CHECK-LABEL: define <2 x double> @test_vminq_f64(<2 x double> %a, <2 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK:   [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> [[VMIN_I]], <2 x double> [[VMIN1_I]]) #4
// CHECK:   ret <2 x double> [[VMIN2_I]]
float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) {
  return vminq_f64(a, b);
}

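// vmaxnm_*/vminnm_* follow the IEEE 754-2008 maxNum/minNum rules: when
// exactly one of a lane pair is a quiet NaN, the numeric operand is
// returned rather than the NaN that vmax/vmin would propagate.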
// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> [[VMAXNM_I]], <2 x float> [[VMAXNM1_I]]) #4
// CHECK:   ret <2 x float> [[VMAXNM2_I]]
float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
  return vmaxnm_f32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> [[VMAXNM_I]], <4 x float> [[VMAXNM1_I]]) #4
// CHECK:   ret <4 x float> [[VMAXNM2_I]]
float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
  return vmaxnmq_f32(a, b);
}

// CHECK-LABEL: define <2 x double> @test_vmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK:   [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK:   [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> [[VMAXNM_I]], <2 x double> [[VMAXNM1_I]]) #4
// CHECK:   ret <2 x double> [[VMAXNM2_I]]
float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) {
  return vmaxnmq_f64(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> [[VMINNM_I]], <2 x float> [[VMINNM1_I]]) #4
// CHECK:   ret <2 x float> [[VMINNM2_I]]
float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
  return vminnm_f32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> [[VMINNM_I]], <4 x float> [[VMINNM1_I]]) #4
// CHECK:   ret <4 x float> [[VMINNM2_I]]
float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
  return vminnmq_f32(a, b);
}

// CHECK-LABEL: define <2 x double> @test_vminnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK:   [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK:   [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> [[VMINNM_I]], <2 x double> [[VMINNM1_I]]) #4
// CHECK:   ret <2 x double> [[VMINNM2_I]]
float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) {
  return vminnmq_f64(a, b);
}

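// vpmax_*/vpmin_* reduce pairwise: adjacent lanes are combined, results
// from the first operand filling the low half of the output and results
// from the second operand the high half. Illustrative values (not matched
// by FileCheck): vpmax_s8 on {0,1,2,3,4,5,6,7} and {8,9,10,11,12,13,14,15}
// gives {1,3,5,7,9,11,13,15}.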
   4585 // CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   4586 // CHECK:   [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   4587 // CHECK:   ret <8 x i8> [[VPMAX_I]]
   4588 int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
   4589   return vpmax_s8(a, b);
   4590 }
   4591 
   4592 // CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   4593 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4594 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4595 // CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4596 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4597 // CHECK:   [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4
   4598 // CHECK:   ret <4 x i16> [[VPMAX2_I]]
   4599 int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
   4600   return vpmax_s16(a, b);
   4601 }
   4602 
   4603 // CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   4604 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4605 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4606 // CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4607 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4608 // CHECK:   [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4
   4609 // CHECK:   ret <2 x i32> [[VPMAX2_I]]
   4610 int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
   4611   return vpmax_s32(a, b);
   4612 }
   4613 
   4614 // CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   4615 // CHECK:   [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   4616 // CHECK:   ret <8 x i8> [[VPMAX_I]]
   4617 uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
   4618   return vpmax_u8(a, b);
   4619 }
   4620 
   4621 // CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   4622 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4623 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4624 // CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4625 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4626 // CHECK:   [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4
   4627 // CHECK:   ret <4 x i16> [[VPMAX2_I]]
   4628 uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
   4629   return vpmax_u16(a, b);
   4630 }
   4631 
   4632 // CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   4633 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4634 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4635 // CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4636 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4637 // CHECK:   [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4
   4638 // CHECK:   ret <2 x i32> [[VPMAX2_I]]
   4639 uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
   4640   return vpmax_u32(a, b);
   4641 }
   4642 
   4643 // CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 {
   4644 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   4645 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   4646 // CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   4647 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   4648 // CHECK:   [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[VPMAX_I]], <2 x float> [[VPMAX1_I]]) #4
   4649 // CHECK:   ret <2 x float> [[VPMAX2_I]]
   4650 float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
   4651   return vpmax_f32(a, b);
   4652 }
   4653 
   4654 // CHECK-LABEL: define <16 x i8> @test_vpmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   4655 // CHECK:   [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   4656 // CHECK:   ret <16 x i8> [[VPMAX_I]]
   4657 int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) {
   4658   return vpmaxq_s8(a, b);
   4659 }
   4660 
   4661 // CHECK-LABEL: define <8 x i16> @test_vpmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   4662 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   4663 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   4664 // CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   4665 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   4666 // CHECK:   [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4
   4667 // CHECK:   ret <8 x i16> [[VPMAX2_I]]
   4668 int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) {
   4669   return vpmaxq_s16(a, b);
   4670 }
   4671 
   4672 // CHECK-LABEL: define <4 x i32> @test_vpmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   4673 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4674 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   4675 // CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4676 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   4677 // CHECK:   [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]]) #4
   4678 // CHECK:   ret <4 x i32> [[VPMAX2_I]]
   4679 int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) {
   4680   return vpmaxq_s32(a, b);
   4681 }
   4682 
   4683 // CHECK-LABEL: define <16 x i8> @test_vpmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   4684 // CHECK:   [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   4685 // CHECK:   ret <16 x i8> [[VPMAX_I]]
   4686 uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) {
   4687   return vpmaxq_u8(a, b);
   4688 }
   4689 
   4690 // CHECK-LABEL: define <8 x i16> @test_vpmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   4691 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   4692 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   4693 // CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   4694 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   4695 // CHECK:   [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4
   4696 // CHECK:   ret <8 x i16> [[VPMAX2_I]]
   4697 uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) {
   4698   return vpmaxq_u16(a, b);
   4699 }
   4700 
   4701 // CHECK-LABEL: define <4 x i32> @test_vpmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   4702 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4703 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   4704 // CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4705 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   4706 // CHECK:   [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]]) #4
   4707 // CHECK:   ret <4 x i32> [[VPMAX2_I]]
   4708 uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) {
   4709   return vpmaxq_u32(a, b);
   4710 }
   4711 
   4712 // CHECK-LABEL: define <4 x float> @test_vpmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
   4713 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   4714 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   4715 // CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   4716 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   4717 // CHECK:   [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> [[VPMAX_I]], <4 x float> [[VPMAX1_I]]) #4
   4718 // CHECK:   ret <4 x float> [[VPMAX2_I]]
   4719 float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) {
   4720   return vpmaxq_f32(a, b);
   4721 }
   4722 
   4723 // CHECK-LABEL: define <2 x double> @test_vpmaxq_f64(<2 x double> %a, <2 x double> %b) #0 {
   4724 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   4725 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
   4726 // CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   4727 // CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   4728 // CHECK:   [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> [[VPMAX_I]], <2 x double> [[VPMAX1_I]]) #4
   4729 // CHECK:   ret <2 x double> [[VPMAX2_I]]
   4730 float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) {
   4731   return vpmaxq_f64(a, b);
   4732 }
   4733 
   4734 // CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   4735 // CHECK:   [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   4736 // CHECK:   ret <8 x i8> [[VPMIN_I]]
   4737 int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
   4738   return vpmin_s8(a, b);
   4739 }
   4740 
   4741 // CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   4742 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4743 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4744 // CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4745 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4746 // CHECK:   [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4
   4747 // CHECK:   ret <4 x i16> [[VPMIN2_I]]
   4748 int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
   4749   return vpmin_s16(a, b);
   4750 }
   4751 
   4752 // CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   4753 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4754 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4755 // CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4756 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4757 // CHECK:   [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4
   4758 // CHECK:   ret <2 x i32> [[VPMIN2_I]]
   4759 int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
   4760   return vpmin_s32(a, b);
   4761 }
   4762 
   4763 // CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   4764 // CHECK:   [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   4765 // CHECK:   ret <8 x i8> [[VPMIN_I]]
   4766 uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
   4767   return vpmin_u8(a, b);
   4768 }
   4769 
   4770 // CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   4771 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4772 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4773 // CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4774 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4775 // CHECK:   [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4
   4776 // CHECK:   ret <4 x i16> [[VPMIN2_I]]
   4777 uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
   4778   return vpmin_u16(a, b);
   4779 }
   4780 
   4781 // CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   4782 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4783 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4784 // CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4785 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4786 // CHECK:   [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4
   4787 // CHECK:   ret <2 x i32> [[VPMIN2_I]]
   4788 uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
   4789   return vpmin_u32(a, b);
   4790 }
   4791 
   4792 // CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 {
   4793 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   4794 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   4795 // CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   4796 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   4797 // CHECK:   [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[VPMIN_I]], <2 x float> [[VPMIN1_I]]) #4
   4798 // CHECK:   ret <2 x float> [[VPMIN2_I]]
   4799 float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
   4800   return vpmin_f32(a, b);
   4801 }
   4802 
   4803 // CHECK-LABEL: define <16 x i8> @test_vpminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   4804 // CHECK:   [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   4805 // CHECK:   ret <16 x i8> [[VPMIN_I]]
   4806 int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) {
   4807   return vpminq_s8(a, b);
   4808 }
   4809 
   4810 // CHECK-LABEL: define <8 x i16> @test_vpminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   4811 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   4812 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   4813 // CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   4814 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   4815 // CHECK:   [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4
   4816 // CHECK:   ret <8 x i16> [[VPMIN2_I]]
   4817 int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) {
   4818   return vpminq_s16(a, b);
   4819 }
   4820 
   4821 // CHECK-LABEL: define <4 x i32> @test_vpminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   4822 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4823 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   4824 // CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4825 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   4826 // CHECK:   [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4
   4827 // CHECK:   ret <4 x i32> [[VPMIN2_I]]
   4828 int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) {
   4829   return vpminq_s32(a, b);
   4830 }
   4831 
   4832 // CHECK-LABEL: define <16 x i8> @test_vpminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   4833 // CHECK:   [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   4834 // CHECK:   ret <16 x i8> [[VPMIN_I]]
   4835 uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) {
   4836   return vpminq_u8(a, b);
   4837 }
   4838 
   4839 // CHECK-LABEL: define <8 x i16> @test_vpminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   4840 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   4841 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   4842 // CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   4843 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   4844 // CHECK:   [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4
   4845 // CHECK:   ret <8 x i16> [[VPMIN2_I]]
   4846 uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) {
   4847   return vpminq_u16(a, b);
   4848 }
   4849 
   4850 // CHECK-LABEL: define <4 x i32> @test_vpminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   4851 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   4852 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   4853 // CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   4854 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   4855 // CHECK:   [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4
   4856 // CHECK:   ret <4 x i32> [[VPMIN2_I]]
   4857 uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) {
   4858   return vpminq_u32(a, b);
   4859 }
   4860 
   4861 // CHECK-LABEL: define <4 x float> @test_vpminq_f32(<4 x float> %a, <4 x float> %b) #0 {
   4862 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   4863 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   4864 // CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   4865 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   4866 // CHECK:   [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> [[VPMIN_I]], <4 x float> [[VPMIN1_I]]) #4
   4867 // CHECK:   ret <4 x float> [[VPMIN2_I]]
   4868 float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) {
   4869   return vpminq_f32(a, b);
   4870 }
   4871 
   4872 // CHECK-LABEL: define <2 x double> @test_vpminq_f64(<2 x double> %a, <2 x double> %b) #0 {
   4873 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   4874 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
   4875 // CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   4876 // CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   4877 // CHECK:   [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> [[VPMIN_I]], <2 x double> [[VPMIN1_I]]) #4
   4878 // CHECK:   ret <2 x double> [[VPMIN2_I]]
   4879 float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) {
   4880   return vpminq_f64(a, b);
   4881 }
   4882 
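// Illustrative usage, not part of the FileCheck-verified tests: the
// pairwise-min intrinsics take the minimum of each adjacent lane pair
// across the concatenation of their two operands, so two vpmin_f32
// steps reduce a float32x4_t to its smallest lane (NaNs propagate,
// matching fminp). A minimal sketch; unused static inline helpers do
// not appear in the emitted IR:
static inline float example_horizontal_min_f32(float32x4_t v) {
  float32x2_t lo = vget_low_f32(v);   // lanes 0,1
  float32x2_t hi = vget_high_f32(v);  // lanes 2,3
  float32x2_t m = vpmin_f32(lo, hi);  // {min(v0,v1), min(v2,v3)}
  m = vpmin_f32(m, m);                // both lanes hold the overall min
  return vget_lane_f32(m, 0);
}
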
   4883 // CHECK-LABEL: define <2 x float> @test_vpmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 {
   4884 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   4885 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   4886 // CHECK:   [[VPMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   4887 // CHECK:   [[VPMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   4888 // CHECK:   [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> [[VPMAXNM_I]], <2 x float> [[VPMAXNM1_I]]) #4
   4889 // CHECK:   ret <2 x float> [[VPMAXNM2_I]]
   4890 float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) {
   4891   return vpmaxnm_f32(a, b);
   4892 }
   4893 
   4894 // CHECK-LABEL: define <4 x float> @test_vpmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
   4895 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   4896 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   4897 // CHECK:   [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   4898 // CHECK:   [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   4899 // CHECK:   [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> [[VPMAXNM_I]], <4 x float> [[VPMAXNM1_I]]) #4
   4900 // CHECK:   ret <4 x float> [[VPMAXNM2_I]]
   4901 float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) {
   4902   return vpmaxnmq_f32(a, b);
   4903 }
   4904 
   4905 // CHECK-LABEL: define <2 x double> @test_vpmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
   4906 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   4907 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
   4908 // CHECK:   [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   4909 // CHECK:   [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   4910 // CHECK:   [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> [[VPMAXNM_I]], <2 x double> [[VPMAXNM1_I]]) #4
   4911 // CHECK:   ret <2 x double> [[VPMAXNM2_I]]
   4912 float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) {
   4913   return vpmaxnmq_f64(a, b);
   4914 }
   4915 
   4916 // CHECK-LABEL: define <2 x float> @test_vpminnm_f32(<2 x float> %a, <2 x float> %b) #0 {
   4917 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   4918 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   4919 // CHECK:   [[VPMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   4920 // CHECK:   [[VPMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   4921 // CHECK:   [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> [[VPMINNM_I]], <2 x float> [[VPMINNM1_I]]) #4
   4922 // CHECK:   ret <2 x float> [[VPMINNM2_I]]
   4923 float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) {
   4924   return vpminnm_f32(a, b);
   4925 }
   4926 
   4927 // CHECK-LABEL: define <4 x float> @test_vpminnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
   4928 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   4929 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   4930 // CHECK:   [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   4931 // CHECK:   [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   4932 // CHECK:   [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> [[VPMINNM_I]], <4 x float> [[VPMINNM1_I]]) #4
   4933 // CHECK:   ret <4 x float> [[VPMINNM2_I]]
   4934 float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) {
   4935   return vpminnmq_f32(a, b);
   4936 }
   4937 
   4938 // CHECK-LABEL: define <2 x double> @test_vpminnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
   4939 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   4940 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
   4941 // CHECK:   [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   4942 // CHECK:   [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   4943 // CHECK:   [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> [[VPMINNM_I]], <2 x double> [[VPMINNM1_I]]) #4
   4944 // CHECK:   ret <2 x double> [[VPMINNM2_I]]
   4945 float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) {
   4946   return vpminnmq_f64(a, b);
   4947 }
   4948 
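// Illustrative note, not a checked test: the "nm" pairwise variants
// lower to fminnmp/fmaxnmp, which follow IEEE 754-2008 minNum/maxNum
// semantics - when exactly one element of a pair is a quiet NaN, the
// numeric element is returned, whereas vpmin/vpmax would return the
// NaN. A trivial wrapper for illustration:
static inline float32x2_t example_pairwise_min_numeric(float32x2_t a,
                                                       float32x2_t b) {
  return vpminnm_f32(a, b);  // pairwise min that prefers numbers to NaNs
}
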
   4949 // CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   4950 // CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   4951 // CHECK:   ret <8 x i8> [[VPADD_V_I]]
   4952 int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
   4953   return vpadd_s8(a, b);
   4954 }
   4955 
   4956 // CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   4957 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4958 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4959 // CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4960 // CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4961 // CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
   4962 // CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
   4963 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
   4964 // CHECK:   ret <4 x i16> [[TMP2]]
   4965 int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
   4966   return vpadd_s16(a, b);
   4967 }
   4968 
   4969 // CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   4970 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   4971 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   4972 // CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   4973 // CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   4974 // CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
   4975 // CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
   4976 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
   4977 // CHECK:   ret <2 x i32> [[TMP2]]
   4978 int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
   4979   return vpadd_s32(a, b);
   4980 }
   4981 
   4982 // CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   4983 // CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   4984 // CHECK:   ret <8 x i8> [[VPADD_V_I]]
   4985 uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
   4986   return vpadd_u8(a, b);
   4987 }
   4988 
   4989 // CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   4990 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   4991 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   4992 // CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   4993 // CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   4994 // CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
   4995 // CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
   4996 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
   4997 // CHECK:   ret <4 x i16> [[TMP2]]
   4998 uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
   4999   return vpadd_u16(a, b);
   5000 }
   5001 
   5002 // CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   5003 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5004 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   5005 // CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5006 // CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   5007 // CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
   5008 // CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
   5009 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
   5010 // CHECK:   ret <2 x i32> [[TMP2]]
   5011 uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
   5012   return vpadd_u32(a, b);
   5013 }
   5014 
   5015 // CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 {
   5016 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   5017 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   5018 // CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   5019 // CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   5020 // CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4
   5021 // CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
   5022 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float>
   5023 // CHECK:   ret <2 x float> [[TMP2]]
   5024 float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
   5025   return vpadd_f32(a, b);
   5026 }
   5027 
   5028 // CHECK-LABEL: define <16 x i8> @test_vpaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   5029 // CHECK:   [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   5030 // CHECK:   ret <16 x i8> [[VPADDQ_V_I]]
   5031 int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) {
   5032   return vpaddq_s8(a, b);
   5033 }
   5034 
   5035 // CHECK-LABEL: define <8 x i16> @test_vpaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   5036 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5037 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   5038 // CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5039 // CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   5040 // CHECK:   [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4
   5041 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
   5042 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16>
   5043 // CHECK:   ret <8 x i16> [[TMP2]]
   5044 int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) {
   5045   return vpaddq_s16(a, b);
   5046 }
   5047 
   5048 // CHECK-LABEL: define <4 x i32> @test_vpaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   5049 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5050 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   5051 // CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5052 // CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   5053 // CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4
   5054 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8>
   5055 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32>
   5056 // CHECK:   ret <4 x i32> [[TMP2]]
   5057 int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) {
   5058   return vpaddq_s32(a, b);
   5059 }
   5060 
   5061 // CHECK-LABEL: define <16 x i8> @test_vpaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   5062 // CHECK:   [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
   5063 // CHECK:   ret <16 x i8> [[VPADDQ_V_I]]
   5064 uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) {
   5065   return vpaddq_u8(a, b);
   5066 }
   5067 
   5068 // CHECK-LABEL: define <8 x i16> @test_vpaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   5069 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5070 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   5071 // CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5072 // CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   5073 // CHECK:   [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4
   5074 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
   5075 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16>
   5076 // CHECK:   ret <8 x i16> [[TMP2]]
   5077 uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) {
   5078   return vpaddq_u16(a, b);
   5079 }
   5080 
   5081 // CHECK-LABEL: define <4 x i32> @test_vpaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   5082 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5083 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   5084 // CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5085 // CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   5086 // CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4
   5087 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8>
   5088 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32>
   5089 // CHECK:   ret <4 x i32> [[TMP2]]
   5090 uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) {
   5091   return vpaddq_u32(a, b);
   5092 }
   5093 
   5094 // CHECK-LABEL: define <4 x float> @test_vpaddq_f32(<4 x float> %a, <4 x float> %b) #0 {
   5095 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   5096 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   5097 // CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   5098 // CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   5099 // CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> [[VPADDQ_V_I]], <4 x float> [[VPADDQ_V1_I]]) #4
   5100 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8>
   5101 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x float>
   5102 // CHECK:   ret <4 x float> [[TMP2]]
   5103 float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
   5104   return vpaddq_f32(a, b);
   5105 }
   5106 
   5107 // CHECK-LABEL: define <2 x double> @test_vpaddq_f64(<2 x double> %a, <2 x double> %b) #0 {
   5108 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   5109 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
   5110 // CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   5111 // CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   5112 // CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> [[VPADDQ_V_I]], <2 x double> [[VPADDQ_V1_I]]) #4
   5113 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8>
   5114 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x double>
   5115 // CHECK:   ret <2 x double> [[TMP2]]
   5116 float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
   5117   return vpaddq_f64(a, b);
   5118 }
   5119 
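// Illustrative usage, not a checked test: repeated pairwise adds form
// a horizontal sum, since each vpadd step halves the number of
// distinct partial sums. Lanes wrap on overflow, as with any NEON
// integer add. A minimal sketch:
static inline int16_t example_horizontal_add_s16(int16x4_t v) {
  int16x4_t s = vpadd_s16(v, v);  // {v0+v1, v2+v3, v0+v1, v2+v3}
  s = vpadd_s16(s, s);            // every lane now holds v0+v1+v2+v3
  return vget_lane_s16(s, 0);
}
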
   5120 // CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   5121 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5122 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   5123 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5124 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   5125 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
   5126 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
   5127 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
   5128 // CHECK:   ret <4 x i16> [[TMP2]]
   5129 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
   5130   return vqdmulh_s16(a, b);
   5131 }
   5132 
   5133 // CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   5134 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5135 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   5136 // CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5137 // CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   5138 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
   5139 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
   5140 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
   5141 // CHECK:   ret <2 x i32> [[TMP2]]
   5142 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
   5143   return vqdmulh_s32(a, b);
   5144 }
   5145 
   5146 // CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   5147 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5148 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   5149 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5150 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   5151 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
   5152 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
   5153 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
   5154 // CHECK:   ret <8 x i16> [[TMP2]]
   5155 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
   5156   return vqdmulhq_s16(a, b);
   5157 }
   5158 
   5159 // CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   5160 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5161 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   5162 // CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5163 // CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   5164 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
   5165 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
   5166 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
   5167 // CHECK:   ret <4 x i32> [[TMP2]]
   5168 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
   5169   return vqdmulhq_s32(a, b);
   5170 }
   5171 
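// Illustrative scalar reference, not a checked test: per lane,
// vqdmulh_s16 computes sat((2*a*b) >> 16); the only input pair that
// actually saturates is INT16_MIN * INT16_MIN. A sketch assuming
// arithmetic right shift of negative values (true on AArch64):
static inline int16_t example_qdmulh_s16_ref(int16_t a, int16_t b) {
  int64_t p = 2 * (int64_t)a * (int64_t)b;  // doubling product, exact
  if (p > 0x7FFFFFFF)
    p = 0x7FFFFFFF;                         // saturate -32768 * -32768
  return (int16_t)(p >> 16);                // keep the high 16 bits
}
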
   5172 // CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   5173 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5174 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   5175 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5176 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   5177 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
   5178 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
   5179 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
   5180 // CHECK:   ret <4 x i16> [[TMP2]]
   5181 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
   5182   return vqrdmulh_s16(a, b);
   5183 }
   5184 
   5185 // CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   5186 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5187 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   5188 // CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5189 // CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   5190 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
   5191 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
   5192 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
   5193 // CHECK:   ret <2 x i32> [[TMP2]]
   5194 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
   5195   return vqrdmulh_s32(a, b);
   5196 }
   5197 
   5198 // CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   5199 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5200 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   5201 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5202 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   5203 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
   5204 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
   5205 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
   5206 // CHECK:   ret <8 x i16> [[TMP2]]
   5207 int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
   5208   return vqrdmulhq_s16(a, b);
   5209 }
   5210 
   5211 // CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   5212 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5213 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   5214 // CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5215 // CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   5216 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
   5217 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
   5218 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
   5219 // CHECK:   ret <4 x i32> [[TMP2]]
   5220 int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
   5221   return vqrdmulhq_s32(a, b);
   5222 }
   5223 
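// Illustrative scalar reference, not a checked test: vqrdmulh matches
// vqdmulh except that a rounding constant of 1 << 15 is added before
// the high half is taken:
static inline int16_t example_qrdmulh_s16_ref(int16_t a, int16_t b) {
  int64_t p = 2 * (int64_t)a * (int64_t)b + (1 << 15);  // round to nearest
  if (p > 0x7FFFFFFF)
    p = 0x7FFFFFFF;                                     // saturate
  return (int16_t)(p >> 16);
}
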
   5224 // CHECK-LABEL: define <2 x float> @test_vmulx_f32(<2 x float> %a, <2 x float> %b) #0 {
   5225 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   5226 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
   5227 // CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   5228 // CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
   5229 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #4
   5230 // CHECK:   ret <2 x float> [[VMULX2_I]]
   5231 float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) {
   5232   return vmulx_f32(a, b);
   5233 }
   5234 
   5235 // CHECK-LABEL: define <4 x float> @test_vmulxq_f32(<4 x float> %a, <4 x float> %b) #0 {
   5236 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   5237 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
   5238 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   5239 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
   5240 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #4
   5241 // CHECK:   ret <4 x float> [[VMULX2_I]]
   5242 float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) {
   5243   return vmulxq_f32(a, b);
   5244 }
   5245 
   5246 // CHECK-LABEL: define <2 x double> @test_vmulxq_f64(<2 x double> %a, <2 x double> %b) #0 {
   5247 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   5248 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
   5249 // CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   5250 // CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
   5251 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #4
   5252 // CHECK:   ret <2 x double> [[VMULX2_I]]
   5253 float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) {
   5254   return vmulxq_f64(a, b);
   5255 }
   5256 
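// Illustrative note, not a checked test: fmulx behaves like an
// ordinary floating-point multiply except that (+/-0.0) * (+/-Inf)
// yields +/-2.0 instead of NaN, which keeps reciprocal and
// reciprocal-sqrt iteration sequences NaN-free. A trivial wrapper:
static inline float32x2_t example_mulx_f32(float32x2_t a, float32x2_t b) {
  return vmulx_f32(a, b);
}
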
   5257 // CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 {
   5258 // CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5259 // CHECK:   ret <8 x i8> [[VSHL_N]]
   5260 int8x8_t test_vshl_n_s8(int8x8_t a) {
   5261   return vshl_n_s8(a, 3);
   5262 }
   5263 
   5264 // CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 {
   5265 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5266 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5267 // CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
   5268 // CHECK:   ret <4 x i16> [[VSHL_N]]
   5269 int16x4_t test_vshl_n_s16(int16x4_t a) {
   5270   return vshl_n_s16(a, 3);
   5271 }
   5272 
   5273 // CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 {
   5274 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5275 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5276 // CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3>
   5277 // CHECK:   ret <2 x i32> [[VSHL_N]]
   5278 int32x2_t test_vshl_n_s32(int32x2_t a) {
   5279   return vshl_n_s32(a, 3);
   5280 }
   5281 
   5282 // CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 {
   5283 // CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5284 // CHECK:   ret <16 x i8> [[VSHL_N]]
   5285 int8x16_t test_vshlq_n_s8(int8x16_t a) {
   5286   return vshlq_n_s8(a, 3);
   5287 }
   5288 
   5289 // CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 {
   5290 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5291 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5292 // CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
   5293 // CHECK:   ret <8 x i16> [[VSHL_N]]
   5294 int16x8_t test_vshlq_n_s16(int16x8_t a) {
   5295   return vshlq_n_s16(a, 3);
   5296 }
   5297 
   5298 // CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 {
   5299 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5300 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5301 // CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
   5302 // CHECK:   ret <4 x i32> [[VSHL_N]]
   5303 int32x4_t test_vshlq_n_s32(int32x4_t a) {
   5304   return vshlq_n_s32(a, 3);
   5305 }
   5306 
   5307 // CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 {
   5308 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   5309 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   5310 // CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3>
   5311 // CHECK:   ret <2 x i64> [[VSHL_N]]
   5312 int64x2_t test_vshlq_n_s64(int64x2_t a) {
   5313   return vshlq_n_s64(a, 3);
   5314 }
   5315 
   5316 // CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 {
   5317 // CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5318 // CHECK:   ret <8 x i8> [[VSHL_N]]
   5319 uint8x8_t test_vshl_n_u8(uint8x8_t a) {
   5320   return vshl_n_u8(a, 3);
   5321 }
   5322 
   5323 // CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 {
   5324 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5325 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5326 // CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
   5327 // CHECK:   ret <4 x i16> [[VSHL_N]]
   5328 uint16x4_t test_vshl_n_u16(uint16x4_t a) {
   5329   return vshl_n_u16(a, 3);
   5330 }
   5331 
   5332 // CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 {
   5333 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5334 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5335 // CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3>
   5336 // CHECK:   ret <2 x i32> [[VSHL_N]]
   5337 uint32x2_t test_vshl_n_u32(uint32x2_t a) {
   5338   return vshl_n_u32(a, 3);
   5339 }
   5340 
   5341 // CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 {
   5342 // CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5343 // CHECK:   ret <16 x i8> [[VSHL_N]]
   5344 uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
   5345   return vshlq_n_u8(a, 3);
   5346 }
   5347 
   5348 // CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 {
   5349 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5350 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5351 // CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
   5352 // CHECK:   ret <8 x i16> [[VSHL_N]]
   5353 uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
   5354   return vshlq_n_u16(a, 3);
   5355 }
   5356 
   5357 // CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 {
   5358 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5359 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5360 // CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
   5361 // CHECK:   ret <4 x i32> [[VSHL_N]]
   5362 uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
   5363   return vshlq_n_u32(a, 3);
   5364 }
   5365 
   5366 // CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 {
   5367 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   5368 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   5369 // CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3>
   5370 // CHECK:   ret <2 x i64> [[VSHL_N]]
   5371 uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
   5372   return vshlq_n_u64(a, 3);
   5373 }
   5374 
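// Illustrative usage, not a checked test: the _n shift intrinsics
// require a compile-time-constant shift amount, which is why they
// lower to a plain IR shl rather than an intrinsic call. For example,
// rescaling a Q13 fixed-point value to Q16 (lanes wrap on overflow):
static inline int16x4_t example_q13_to_q16(int16x4_t q13) {
  return vshl_n_s16(q13, 3);  // multiply each lane by 8
}
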
   5375 // CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 {
   5376 // CHECK:   [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5377 // CHECK:   ret <8 x i8> [[VSHR_N]]
   5378 int8x8_t test_vshr_n_s8(int8x8_t a) {
   5379   return vshr_n_s8(a, 3);
   5380 }
   5381 
   5382 // CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 {
   5383 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5384 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5385 // CHECK:   [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
   5386 // CHECK:   ret <4 x i16> [[VSHR_N]]
   5387 int16x4_t test_vshr_n_s16(int16x4_t a) {
   5388   return vshr_n_s16(a, 3);
   5389 }
   5390 
   5391 // CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 {
   5392 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5393 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5394 // CHECK:   [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 3, i32 3>
   5395 // CHECK:   ret <2 x i32> [[VSHR_N]]
   5396 int32x2_t test_vshr_n_s32(int32x2_t a) {
   5397   return vshr_n_s32(a, 3);
   5398 }
   5399 
   5400 // CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 {
   5401 // CHECK:   [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5402 // CHECK:   ret <16 x i8> [[VSHR_N]]
   5403 int8x16_t test_vshrq_n_s8(int8x16_t a) {
   5404   return vshrq_n_s8(a, 3);
   5405 }
   5406 
   5407 // CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 {
   5408 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5409 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5410 // CHECK:   [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
   5411 // CHECK:   ret <8 x i16> [[VSHR_N]]
   5412 int16x8_t test_vshrq_n_s16(int16x8_t a) {
   5413   return vshrq_n_s16(a, 3);
   5414 }
   5415 
   5416 // CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 {
   5417 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5418 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5419 // CHECK:   [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
   5420 // CHECK:   ret <4 x i32> [[VSHR_N]]
   5421 int32x4_t test_vshrq_n_s32(int32x4_t a) {
   5422   return vshrq_n_s32(a, 3);
   5423 }
   5424 
   5425 // CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 {
   5426 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   5427 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   5428 // CHECK:   [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 3, i64 3>
   5429 // CHECK:   ret <2 x i64> [[VSHR_N]]
   5430 int64x2_t test_vshrq_n_s64(int64x2_t a) {
   5431   return vshrq_n_s64(a, 3);
   5432 }
   5433 
   5434 // CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 {
   5435 // CHECK:   [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5436 // CHECK:   ret <8 x i8> [[VSHR_N]]
   5437 uint8x8_t test_vshr_n_u8(uint8x8_t a) {
   5438   return vshr_n_u8(a, 3);
   5439 }
   5440 
   5441 // CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 {
   5442 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5443 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5444 // CHECK:   [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
   5445 // CHECK:   ret <4 x i16> [[VSHR_N]]
   5446 uint16x4_t test_vshr_n_u16(uint16x4_t a) {
   5447   return vshr_n_u16(a, 3);
   5448 }
   5449 
   5450 // CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 {
   5451 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5452 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5453 // CHECK:   [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 3, i32 3>
   5454 // CHECK:   ret <2 x i32> [[VSHR_N]]
   5455 uint32x2_t test_vshr_n_u32(uint32x2_t a) {
   5456   return vshr_n_u32(a, 3);
   5457 }
   5458 
   5459 // CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 {
   5460 // CHECK:   [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5461 // CHECK:   ret <16 x i8> [[VSHR_N]]
   5462 uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
   5463   return vshrq_n_u8(a, 3);
   5464 }
   5465 
   5466 // CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 {
   5467 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5468 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5469 // CHECK:   [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
   5470 // CHECK:   ret <8 x i16> [[VSHR_N]]
   5471 uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
   5472   return vshrq_n_u16(a, 3);
   5473 }
   5474 
   5475 // CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 {
   5476 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5477 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5478 // CHECK:   [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
   5479 // CHECK:   ret <4 x i32> [[VSHR_N]]
   5480 uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
   5481   return vshrq_n_u32(a, 3);
   5482 }
   5483 
   5484 // CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 {
   5485 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   5486 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   5487 // CHECK:   [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 3, i64 3>
   5488 // CHECK:   ret <2 x i64> [[VSHR_N]]
   5489 uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
   5490   return vshrq_n_u64(a, 3);
   5491 }
   5492 
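// Illustrative usage, not a checked test: vshr_n_s* lowers to an
// arithmetic shift (ashr, sign-extending) while vshr_n_u* lowers to a
// logical shift (lshr, zero-filling), mirroring the checks above:
static inline uint8x8_t example_scale_down_u8(uint8x8_t a) {
  return vshr_n_u8(a, 3);  // truncating divide of each lane by 8
}
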
   5493 // CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   5494 // CHECK:   [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5495 // CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
   5496 // CHECK:   ret <8 x i8> [[TMP0]]
   5497 int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
   5498   return vsra_n_s8(a, b, 3);
   5499 }
   5500 
   5501 // CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   5502 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5503 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   5504 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5505 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   5506 // CHECK:   [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3>
   5507 // CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
   5508 // CHECK:   ret <4 x i16> [[TMP4]]
   5509 int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
   5510   return vsra_n_s16(a, b, 3);
   5511 }
   5512 
   5513 // CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   5514 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5515 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   5516 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5517 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   5518 // CHECK:   [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 3, i32 3>
   5519 // CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
   5520 // CHECK:   ret <2 x i32> [[TMP4]]
   5521 int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
   5522   return vsra_n_s32(a, b, 3);
   5523 }
   5524 
   5525 // CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   5526 // CHECK:   [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5527 // CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
   5528 // CHECK:   ret <16 x i8> [[TMP0]]
   5529 int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
   5530   return vsraq_n_s8(a, b, 3);
   5531 }
   5532 
   5533 // CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   5534 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5535 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   5536 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5537 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   5538 // CHECK:   [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
   5539 // CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
   5540 // CHECK:   ret <8 x i16> [[TMP4]]
   5541 int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
   5542   return vsraq_n_s16(a, b, 3);
   5543 }
   5544 
   5545 // CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   5546 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5547 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   5548 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5549 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   5550 // CHECK:   [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
   5551 // CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
   5552 // CHECK:   ret <4 x i32> [[TMP4]]
   5553 int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
   5554   return vsraq_n_s32(a, b, 3);
   5555 }
   5556 
   5557 // CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
   5558 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   5559 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   5560 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   5561 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   5562 // CHECK:   [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 3, i64 3>
   5563 // CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
   5564 // CHECK:   ret <2 x i64> [[TMP4]]
   5565 int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
   5566   return vsraq_n_s64(a, b, 3);
   5567 }
   5568 
   5569 // CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   5570 // CHECK:   [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5571 // CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
   5572 // CHECK:   ret <8 x i8> [[TMP0]]
   5573 uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
   5574   return vsra_n_u8(a, b, 3);
   5575 }
   5576 
   5577 // CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   5578 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5579 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   5580 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5581 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   5582 // CHECK:   [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3>
   5583 // CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
   5584 // CHECK:   ret <4 x i16> [[TMP4]]
   5585 uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
   5586   return vsra_n_u16(a, b, 3);
   5587 }
   5588 
   5589 // CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   5590 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5591 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   5592 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5593 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   5594 // CHECK:   [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 3, i32 3>
   5595 // CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
   5596 // CHECK:   ret <2 x i32> [[TMP4]]
   5597 uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
   5598   return vsra_n_u32(a, b, 3);
   5599 }
   5600 
   5601 // CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   5602 // CHECK:   [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   5603 // CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
   5604 // CHECK:   ret <16 x i8> [[TMP0]]
   5605 uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
   5606   return vsraq_n_u8(a, b, 3);
   5607 }
   5608 
   5609 // CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   5610 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5611 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   5612 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5613 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   5614 // CHECK:   [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
   5615 // CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
   5616 // CHECK:   ret <8 x i16> [[TMP4]]
   5617 uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
   5618   return vsraq_n_u16(a, b, 3);
   5619 }
   5620 
   5621 // CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   5622 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5623 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   5624 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5625 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   5626 // CHECK:   [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
   5627 // CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
   5628 // CHECK:   ret <4 x i32> [[TMP4]]
   5629 uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
   5630   return vsraq_n_u32(a, b, 3);
   5631 }
   5632 
   5633 // CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
   5634 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   5635 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   5636 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   5637 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   5638 // CHECK:   [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 3, i64 3>
   5639 // CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
   5640 // CHECK:   ret <2 x i64> [[TMP4]]
   5641 uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
   5642   return vsraq_n_u64(a, b, 3);
   5643 }
   5644 
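// Illustrative usage, not a checked test: vsra_n computes
// a + (b >> n) in a single SSRA/USRA instruction, a common idiom for
// accumulating a scaled correction term:
static inline int16x4_t example_accumulate_shifted(int16x4_t acc,
                                                   int16x4_t err) {
  return vsra_n_s16(acc, err, 3);  // acc += err >> 3 (arithmetic shift)
}
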
   5645 // CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 {
   5646 // CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
   5647 // CHECK:   ret <8 x i8> [[VRSHR_N]]
   5648 int8x8_t test_vrshr_n_s8(int8x8_t a) {
   5649   return vrshr_n_s8(a, 3);
   5650 }
   5651 
   5652 // CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 {
   5653 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5654 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5655 // CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
   5656 // CHECK:   ret <4 x i16> [[VRSHR_N1]]
   5657 int16x4_t test_vrshr_n_s16(int16x4_t a) {
   5658   return vrshr_n_s16(a, 3);
   5659 }
   5660 
   5661 // CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 {
   5662 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5663 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5664 // CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
   5665 // CHECK:   ret <2 x i32> [[VRSHR_N1]]
   5666 int32x2_t test_vrshr_n_s32(int32x2_t a) {
   5667   return vrshr_n_s32(a, 3);
   5668 }
   5669 
   5670 // CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 {
   5671 // CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
   5672 // CHECK:   ret <16 x i8> [[VRSHR_N]]
   5673 int8x16_t test_vrshrq_n_s8(int8x16_t a) {
   5674   return vrshrq_n_s8(a, 3);
   5675 }
   5676 
   5677 // CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 {
   5678 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5679 // CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5680 // CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
   5681 // CHECK:   ret <8 x i16> [[VRSHR_N1]]
   5682 int16x8_t test_vrshrq_n_s16(int16x8_t a) {
   5683   return vrshrq_n_s16(a, 3);
   5684 }
   5685 
   5686 // CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 {
   5687 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5688 // CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5689 // CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
   5690 // CHECK:   ret <4 x i32> [[VRSHR_N1]]
   5691 int32x4_t test_vrshrq_n_s32(int32x4_t a) {
   5692   return vrshrq_n_s32(a, 3);
   5693 }
   5694 
   5695 // CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 {
   5696 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   5697 // CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   5698 // CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
   5699 // CHECK:   ret <2 x i64> [[VRSHR_N1]]
   5700 int64x2_t test_vrshrq_n_s64(int64x2_t a) {
   5701   return vrshrq_n_s64(a, 3);
   5702 }
   5703 
   5704 // CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 {
   5705 // CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
   5706 // CHECK:   ret <8 x i8> [[VRSHR_N]]
   5707 uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
   5708   return vrshr_n_u8(a, 3);
   5709 }
   5710 
   5711 // CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 {
   5712 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   5713 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   5714 // CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
   5715 // CHECK:   ret <4 x i16> [[VRSHR_N1]]
   5716 uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
   5717   return vrshr_n_u16(a, 3);
   5718 }
   5719 
   5720 // CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 {
   5721 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   5722 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   5723 // CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
   5724 // CHECK:   ret <2 x i32> [[VRSHR_N1]]
   5725 uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
   5726   return vrshr_n_u32(a, 3);
   5727 }
   5728 
   5729 // CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 {
   5730 // CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
   5731 // CHECK:   ret <16 x i8> [[VRSHR_N]]
   5732 uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
   5733   return vrshrq_n_u8(a, 3);
   5734 }
   5735 
   5736 // CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 {
   5737 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   5738 // CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   5739 // CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
   5740 // CHECK:   ret <8 x i16> [[VRSHR_N1]]
   5741 uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
   5742   return vrshrq_n_u16(a, 3);
   5743 }
   5744 
   5745 // CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 {
   5746 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   5747 // CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   5748 // CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
   5749 // CHECK:   ret <4 x i32> [[VRSHR_N1]]
   5750 uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
   5751   return vrshrq_n_u32(a, 3);
   5752 }
   5753 
   5754 // CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 {
   5755 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   5756 // CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   5757 // CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
   5758 // CHECK:   ret <2 x i64> [[VRSHR_N1]]
   5759 uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
   5760   return vrshrq_n_u64(a, 3);
   5761 }
   5762 
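// Illustrative scalar reference, not a checked test: vrshr_n rounds
// to nearest before shifting, i.e. (a + (1 << (n - 1))) >> n, which is
// why it lowers to srshl/urshl with a splatted negative shift amount
// instead of a plain IR shift:
static inline int16_t example_rshr3_s16_ref(int16_t a) {
  return (int16_t)(((int32_t)a + (1 << 2)) >> 3);  // round, then shift by 3
}
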
// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
  return vrsra_n_s8(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <4 x i16> [[TMP3]]
int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
  return vrsra_n_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <2 x i32> [[TMP3]]
int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
  return vrsra_n_s32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
  return vrsraq_n_s8(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <8 x i16> [[TMP3]]
int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
  return vrsraq_n_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <4 x i32> [[TMP3]]
int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
  return vrsraq_n_s32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <2 x i64> [[TMP3]]
int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vrsraq_n_s64(a, b, 3);
}

// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
  return vrsra_n_u8(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <4 x i16> [[TMP3]]
uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
  return vrsra_n_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <2 x i32> [[TMP3]]
uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
  return vrsra_n_u32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vrsraq_n_u8(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <8 x i16> [[TMP3]]
uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vrsraq_n_u16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <4 x i32> [[TMP3]]
uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vrsraq_n_u32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
// CHECK:   ret <2 x i64> [[TMP3]]
uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vrsraq_n_u64(a, b, 3);
}

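// Reference model (illustrative sketch, unused static inline, no IR emitted):
// vrsra(q)_n is the rounding shift above followed by a wrapping accumulate,
// i.e. per lane a + ((b + (1 << (n - 1))) >> n). The accumulate wraps like
// the vector add in the IR above, so the model uses unsigned arithmetic to
// stay clear of signed-overflow UB in C. Hypothetical int32_t lane model:
static inline int32_t ref_vrsra_n_s32(int32_t a, int32_t b, int n) {
  int32_t rshr = (int32_t)(((int64_t)b + (1LL << (n - 1))) >> n);
  return (int32_t)((uint32_t)a + (uint32_t)rshr);
}
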
// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK:   ret <8 x i8> [[VSRI_N]]
int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
  return vsri_n_s8(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
// CHECK:   ret <4 x i16> [[VSRI_N2]]
int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
  return vsri_n_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
// CHECK:   ret <2 x i32> [[VSRI_N2]]
int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
  return vsri_n_s32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK:   ret <16 x i8> [[VSRI_N]]
int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
  return vsriq_n_s8(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
// CHECK:   ret <8 x i16> [[VSRI_N2]]
int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
  return vsriq_n_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
// CHECK:   ret <4 x i32> [[VSRI_N2]]
int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
  return vsriq_n_s32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
// CHECK:   ret <2 x i64> [[VSRI_N2]]
int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
  return vsriq_n_s64(a, b, 3);
}

// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK:   ret <8 x i8> [[VSRI_N]]
uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
  return vsri_n_u8(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
// CHECK:   ret <4 x i16> [[VSRI_N2]]
uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
  return vsri_n_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
// CHECK:   ret <2 x i32> [[VSRI_N2]]
uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsri_n_u32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK:   ret <16 x i8> [[VSRI_N]]
uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vsriq_n_u8(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
// CHECK:   ret <8 x i16> [[VSRI_N2]]
uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vsriq_n_u16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
// CHECK:   ret <4 x i32> [[VSRI_N2]]
uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vsriq_n_u32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
// CHECK:   ret <2 x i64> [[VSRI_N2]]
uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vsriq_n_u64(a, b, 3);
}

// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK:   ret <8 x i8> [[VSRI_N]]
poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
  return vsri_n_p8(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 15)
// CHECK:   ret <4 x i16> [[VSRI_N2]]
poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
  return vsri_n_p16(a, b, 15);
}

// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK:   ret <16 x i8> [[VSRI_N]]
poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
  return vsriq_n_p8(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 15)
// CHECK:   ret <8 x i16> [[VSRI_N2]]
poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
  return vsriq_n_p16(a, b, 15);
}

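// Reference model (illustrative sketch, unused static inline, no IR emitted):
// vsri(q)_n shifts each lane of b right by n and inserts the result into a,
// preserving the top n bits of a; only the low (width - n) bits are written.
// Hypothetical scalar model for one uint8_t lane, assuming 1 <= n <= 8:
static inline uint8_t ref_vsri_n_u8(uint8_t a, uint8_t b, int n) {
  uint8_t mask = (uint8_t)(0xFFu >> n);  // low (8 - n) bits
  return (uint8_t)((a & (uint8_t)~mask) | ((uint8_t)(b >> n) & mask));
}
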
// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK:   ret <8 x i8> [[VSLI_N]]
int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
  return vsli_n_s8(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
  return vsli_n_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
// CHECK:   ret <2 x i32> [[VSLI_N2]]
int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
  return vsli_n_s32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK:   ret <16 x i8> [[VSLI_N]]
int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
  return vsliq_n_s8(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
  return vsliq_n_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
  return vsliq_n_s32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
  return vsliq_n_s64(a, b, 3);
}

// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK:   ret <8 x i8> [[VSLI_N]]
uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
  return vsli_n_u8(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
  return vsli_n_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
// CHECK:   ret <2 x i32> [[VSLI_N2]]
uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsli_n_u32(a, b, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK:   ret <16 x i8> [[VSLI_N]]
uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vsliq_n_u8(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vsliq_n_u16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vsliq_n_u32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vsliq_n_u64(a, b, 3);
}

// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
// CHECK:   ret <8 x i8> [[VSLI_N]]
poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
  return vsli_n_p8(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 15)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
  return vsli_n_p16(a, b, 15);
}

// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
// CHECK:   ret <16 x i8> [[VSLI_N]]
poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
  return vsliq_n_p8(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 15)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
  return vsliq_n_p16(a, b, 15);
}

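// Reference model (illustrative sketch, unused static inline, no IR emitted):
// vsli(q)_n is the left-shifting counterpart: b << n is inserted into a and
// the low n bits of a are preserved. Hypothetical scalar model for one
// uint8_t lane, assuming 0 <= n <= 7:
static inline uint8_t ref_vsli_n_u8(uint8_t a, uint8_t b, int n) {
  uint8_t mask = (uint8_t)(0xFFu << n);  // high (8 - n) bits
  return (uint8_t)((a & (uint8_t)~mask) | ((uint8_t)(b << n) & mask));
}
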
// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
// CHECK:   ret <8 x i8> [[VQSHLU_N]]
uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
  return vqshlu_n_s8(a, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
// CHECK:   ret <4 x i16> [[VQSHLU_N1]]
uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
  return vqshlu_n_s16(a, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 3, i32 3>)
// CHECK:   ret <2 x i32> [[VQSHLU_N1]]
uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
  return vqshlu_n_s32(a, 3);
}

// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
// CHECK:   ret <16 x i8> [[VQSHLU_N]]
uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
  return vqshluq_n_s8(a, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
// CHECK:   ret <8 x i16> [[VQSHLU_N1]]
uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
  return vqshluq_n_s16(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
// CHECK:   ret <4 x i32> [[VQSHLU_N1]]
uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
  return vqshluq_n_s32(a, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 3, i64 3>)
// CHECK:   ret <2 x i64> [[VQSHLU_N1]]
uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
  return vqshluq_n_s64(a, 3);
}

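// Reference model (illustrative sketch, unused static inline, no IR emitted):
// vqshlu(q)_n left-shifts a signed input and saturates to the unsigned range
// of the lane, which is why the ACLE result types above are unsigned:
// negative inputs clamp to 0 and overflow clamps to the all-ones maximum.
// Hypothetical scalar model for one int8_t lane, assuming 0 <= n <= 7:
static inline uint8_t ref_vqshlu_n_s8(int8_t x, int n) {
  if (x < 0)
    return 0;
  int32_t shifted = (int32_t)x << n;  // widen first so the shift cannot overflow
  return shifted > 0xFF ? (uint8_t)0xFF : (uint8_t)shifted;
}
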
// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSHRN_N]]
int8x8_t test_vshrn_n_s16(int16x8_t a) {
  return vshrn_n_s16(a, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSHRN_N]]
int16x4_t test_vshrn_n_s32(int32x4_t a) {
  return vshrn_n_s32(a, 9);
}

// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19>
// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSHRN_N]]
int32x2_t test_vshrn_n_s64(int64x2_t a) {
  return vshrn_n_s64(a, 19);
}

// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSHRN_N]]
uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
  return vshrn_n_u16(a, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSHRN_N]]
uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
  return vshrn_n_u32(a, 9);
}

// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 19, i64 19>
// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSHRN_N]]
uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
  return vshrn_n_u64(a, 19);
}

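// Reference model (illustrative sketch, unused static inline, no IR emitted):
// vshrn_n shifts each wide lane right by n (arithmetic for signed, logical
// for unsigned) and truncates to the narrow element type; that is exactly
// the ashr/lshr-plus-trunc pattern in the IR above. Hypothetical scalar
// model for one int16_t -> int8_t lane, assuming 1 <= n <= 8:
static inline int8_t ref_vshrn_n_s16(int16_t x, int n) {
  return (int8_t)(x >> n);  // truncation keeps the low 8 bits
}
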
// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vshrn_high_n_s16(int8x8_t a, int16x8_t b) {
  return vshrn_high_n_s16(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vshrn_high_n_s32(int16x4_t a, int32x4_t b) {
  return vshrn_high_n_s32(a, b, 9);
}

// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19>
// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vshrn_high_n_s64(int32x2_t a, int64x2_t b) {
  return vshrn_high_n_s64(a, b, 19);
}

// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
  return vshrn_high_n_u16(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
  return vshrn_high_n_u32(a, b, 9);
}

// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 19, i64 19>
// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
  return vshrn_high_n_u64(a, b, 19);
}

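// The _high_n variants narrow b exactly as above and then place the result
// in the upper half of a 128-bit vector, keeping a as the lower half; that
// is the shufflevector with indices 0..15 (or 0..7 / 0..3) in the IR above.
// Illustrative sketch in terms of public intrinsics (unused static inline,
// no IR emitted; the shift amount is fixed at 3 because it must be an
// immediate):
static inline int8x16_t ref_vshrn_high_n_s16_3(int8x8_t lo, int16x8_t b) {
  return vcombine_s8(lo, vshrn_n_s16(b, 3));
}
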
// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
// CHECK:   ret <8 x i8> [[VQSHRUN_N1]]
uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
  return vqshrun_n_s16(a, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
// CHECK:   ret <4 x i16> [[VQSHRUN_N1]]
uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
  return vqshrun_n_s32(a, 9);
}

// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
// CHECK:   ret <2 x i32> [[VQSHRUN_N1]]
uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
  return vqshrun_n_s64(a, 19);
}

// CHECK-LABEL: define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vqshrun_high_n_s16(uint8x8_t a, int16x8_t b) {
  return vqshrun_high_n_s16(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vqshrun_high_n_s32(uint16x4_t a, int32x4_t b) {
  return vqshrun_high_n_s32(a, b, 9);
}

// CHECK-LABEL: define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vqshrun_high_n_s64(uint32x2_t a, int64x2_t b) {
  return vqshrun_high_n_s64(a, b, 19);
}

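// Reference model (illustrative sketch, unused static inline, no IR emitted):
// vqshrun_n shifts a signed wide lane right by n and saturates into the
// unsigned narrow range ("u" in the name refers to the result): values below
// 0 clamp to 0 and values above the narrow maximum clamp to all-ones.
// Hypothetical scalar model for one int16_t -> uint8_t lane:
static inline uint8_t ref_vqshrun_n_s16(int16_t x, int n) {
  int32_t s = (int32_t)x >> n;
  return s < 0 ? (uint8_t)0 : (s > 0xFF ? (uint8_t)0xFF : (uint8_t)s);
}
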
// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
int8x8_t test_vrshrn_n_s16(int16x8_t a) {
  return vrshrn_n_s16(a, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
int16x4_t test_vrshrn_n_s32(int32x4_t a) {
  return vrshrn_n_s32(a, 9);
}

// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
int32x2_t test_vrshrn_n_s64(int64x2_t a) {
  return vrshrn_n_s64(a, 19);
}

// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
  return vrshrn_n_u16(a, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
  return vrshrn_n_u32(a, 9);
}

// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
  return vrshrn_n_u64(a, 19);
}

// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
  return vrshrn_high_n_s16(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
  return vrshrn_high_n_s32(a, b, 9);
}

// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
  return vrshrn_high_n_s64(a, b, 19);
}

// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
  return vrshrn_high_n_u16(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
  return vrshrn_high_n_u32(a, b, 9);
}

// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
  return vrshrn_high_n_u64(a, b, 19);
}

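// Reference model (illustrative sketch, unused static inline, no IR emitted):
// vrshrn_n combines the rounding of vrshr_n with the truncating narrow of
// vshrn_n: per lane, (x + (1 << (n - 1))) >> n, computed in a wider
// intermediate and then truncated to the narrow type. Hypothetical scalar
// model for one int16_t -> int8_t lane:
static inline int8_t ref_vrshrn_n_s16(int16_t x, int n) {
  return (int8_t)(((int32_t)x + (1 << (n - 1))) >> n);
}
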
// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
// CHECK:   ret <8 x i8> [[VQRSHRUN_N1]]
uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
  return vqrshrun_n_s16(a, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
// CHECK:   ret <4 x i16> [[VQRSHRUN_N1]]
uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
  return vqrshrun_n_s32(a, 9);
}

// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
// CHECK:   ret <2 x i32> [[VQRSHRUN_N1]]
uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
  return vqrshrun_n_s64(a, 19);
}

// CHECK-LABEL: define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vqrshrun_high_n_s16(uint8x8_t a, int16x8_t b) {
  return vqrshrun_high_n_s16(a, b, 3);
}

// CHECK-LABEL: define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vqrshrun_high_n_s32(uint16x4_t a, int32x4_t b) {
  return vqrshrun_high_n_s32(a, b, 9);
}

// CHECK-LABEL: define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vqrshrun_high_n_s64(uint32x2_t a, int64x2_t b) {
  return vqrshrun_high_n_s64(a, b, 19);
}

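// Reference model (illustrative sketch, unused static inline, no IR emitted):
// vqrshrun_n adds rounding to vqshrun_n: round, shift right by n, then
// saturate into the unsigned narrow range. Hypothetical scalar model for
// one int16_t -> uint8_t lane:
static inline uint8_t ref_vqrshrun_n_s16(int16_t x, int n) {
  int32_t s = ((int32_t)x + (1 << (n - 1))) >> n;
  return s < 0 ? (uint8_t)0 : (s > 0xFF ? (uint8_t)0xFF : (uint8_t)s);
}
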
// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
int8x8_t test_vqshrn_n_s16(int16x8_t a) {
  return vqshrn_n_s16(a, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
int16x4_t test_vqshrn_n_s32(int32x4_t a) {
  return vqshrn_n_s32(a, 9);
}

// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
int32x2_t test_vqshrn_n_s64(int64x2_t a) {
  return vqshrn_n_s64(a, 19);
}

// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
  return vqshrn_n_u16(a, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
  return vqshrn_n_u32(a, 9);
}

// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
  return vqshrn_n_u64(a, 19);
}

   6730 // CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
   6731 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   6732 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   6733 // CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
   6734 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   6735 // CHECK:   ret <16 x i8> [[SHUFFLE_I]]
   6736 int8x16_t test_vqshrn_high_n_s16(int8x8_t a, int16x8_t b) {
   6737   return vqshrn_high_n_s16(a, b, 3);
   6738 }
   6739 
   6740 // CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
   6741 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   6742 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   6743 // CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
   6744 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   6745 // CHECK:   ret <8 x i16> [[SHUFFLE_I]]
   6746 int16x8_t test_vqshrn_high_n_s32(int16x4_t a, int32x4_t b) {
   6747   return vqshrn_high_n_s32(a, b, 9);
   6748 }
   6749 
   6750 // CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
   6751 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   6752 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   6753 // CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
   6754 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6755 // CHECK:   ret <4 x i32> [[SHUFFLE_I]]
   6756 int32x4_t test_vqshrn_high_n_s64(int32x2_t a, int64x2_t b) {
   6757   return vqshrn_high_n_s64(a, b, 19);
   6758 }
   6759 
   6760 // CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 {
   6761 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   6762 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   6763 // CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
   6764 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   6765 // CHECK:   ret <16 x i8> [[SHUFFLE_I]]
   6766 uint8x16_t test_vqshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
   6767   return vqshrn_high_n_u16(a, b, 3);
   6768 }
   6769 
   6770 // CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 {
   6771 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   6772 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   6773 // CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
   6774 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   6775 // CHECK:   ret <8 x i16> [[SHUFFLE_I]]
   6776 uint16x8_t test_vqshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
   6777   return vqshrn_high_n_u32(a, b, 9);
   6778 }
   6779 
   6780 // CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 {
   6781 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   6782 // CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   6783 // CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
   6784 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6785 // CHECK:   ret <4 x i32> [[SHUFFLE_I]]
   6786 uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   6787   return vqshrn_high_n_u64(a, b, 19);
   6788 }
   6789 
   6790 // CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 {
   6791 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   6792 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   6793 // CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
   6794 // CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
   6795 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
   6796   return vqrshrn_n_s16(a, 3);
   6797 }
   6798 
   6799 // CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 {
   6800 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   6801 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   6802 // CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
   6803 // CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
   6804 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
   6805   return vqrshrn_n_s32(a, 9);
   6806 }
   6807 
   6808 // CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 {
   6809 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   6810 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   6811 // CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
   6812 // CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
   6813 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
   6814   return vqrshrn_n_s64(a, 19);
   6815 }
   6816 
   6817 // CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 {
   6818 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   6819 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   6820 // CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
   6821 // CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
   6822 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
   6823   return vqrshrn_n_u16(a, 3);
   6824 }
   6825 
   6826 // CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 {
   6827 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   6828 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   6829 // CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
   6830 // CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
   6831 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
   6832   return vqrshrn_n_u32(a, 9);
   6833 }
   6834 
   6835 // CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 {
   6836 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   6837 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   6838 // CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
   6839 // CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
   6840 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
   6841   return vqrshrn_n_u64(a, 19);
   6842 }
   6843 
   6844 // CHECK-LABEL: define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
   6845 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   6846 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   6847 // CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
   6848 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   6849 // CHECK:   ret <16 x i8> [[SHUFFLE_I]]
   6850 int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
   6851   return vqrshrn_high_n_s16(a, b, 3);
   6852 }
   6853 
   6854 // CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
   6855 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   6856 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   6857 // CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
   6858 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   6859 // CHECK:   ret <8 x i16> [[SHUFFLE_I]]
   6860 int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
   6861   return vqrshrn_high_n_s32(a, b, 9);
   6862 }
   6863 
   6864 // CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
   6865 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   6866 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   6867 // CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
   6868 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6869 // CHECK:   ret <4 x i32> [[SHUFFLE_I]]
   6870 int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
   6871   return vqrshrn_high_n_s64(a, b, 19);
   6872 }
   6873 
   6874 // CHECK-LABEL: define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 {
   6875 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   6876 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   6877 // CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
   6878 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   6879 // CHECK:   ret <16 x i8> [[SHUFFLE_I]]
   6880 uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
   6881   return vqrshrn_high_n_u16(a, b, 3);
   6882 }
   6883 
   6884 // CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 {
   6885 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   6886 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   6887 // CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
   6888 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   6889 // CHECK:   ret <8 x i16> [[SHUFFLE_I]]
   6890 uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
   6891   return vqrshrn_high_n_u32(a, b, 9);
   6892 }
   6893 
   6894 // CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 {
   6895 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   6896 // CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   6897 // CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
   6898 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   6899 // CHECK:   ret <4 x i32> [[SHUFFLE_I]]
   6900 uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   6901   return vqrshrn_high_n_u64(a, b, 19);
   6902 }
   6903 
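// vshll_n with a shift amount below the element width needs no target
// intrinsic: the expected IR is a sext/zext to the wider element type
// followed by a vector shl by a splatted constant.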
// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
// CHECK:   ret <8 x i16> [[VSHLL_N]]
int16x8_t test_vshll_n_s8(int8x8_t a) {
  return vshll_n_s8(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
// CHECK:   ret <4 x i32> [[VSHLL_N]]
int32x4_t test_vshll_n_s16(int16x4_t a) {
  return vshll_n_s16(a, 9);
}

// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
// CHECK:   ret <2 x i64> [[VSHLL_N]]
int64x2_t test_vshll_n_s32(int32x2_t a) {
  return vshll_n_s32(a, 19);
}

// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
// CHECK:   ret <8 x i16> [[VSHLL_N]]
uint16x8_t test_vshll_n_u8(uint8x8_t a) {
  return vshll_n_u8(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
// CHECK:   ret <4 x i32> [[VSHLL_N]]
uint32x4_t test_vshll_n_u16(uint16x4_t a) {
  return vshll_n_u16(a, 9);
}

// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
// CHECK:   ret <2 x i64> [[VSHLL_N]]
uint64x2_t test_vshll_n_u32(uint32x2_t a) {
  return vshll_n_u32(a, 19);
}

// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
// CHECK:   ret <8 x i16> [[VSHLL_N]]
int16x8_t test_vshll_high_n_s8(int8x16_t a) {
  return vshll_high_n_s8(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
// CHECK:   ret <4 x i32> [[VSHLL_N]]
int32x4_t test_vshll_high_n_s16(int16x8_t a) {
  return vshll_high_n_s16(a, 9);
}

// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
// CHECK:   ret <2 x i64> [[VSHLL_N]]
int64x2_t test_vshll_high_n_s32(int32x4_t a) {
  return vshll_high_n_s32(a, 19);
}

// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
// CHECK:   ret <8 x i16> [[VSHLL_N]]
uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
  return vshll_high_n_u8(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
// CHECK:   ret <4 x i32> [[VSHLL_N]]
uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
  return vshll_high_n_u16(a, 9);
}

// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
// CHECK:   ret <2 x i64> [[VSHLL_N]]
uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
  return vshll_high_n_u32(a, 19);
}

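// vmovl_* is pure widening, so the expected IR is just a sext (signed) or
// zext (unsigned); the _high variants first extract the upper half with a
// shufflevector.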
// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 {
// CHECK:   [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I]]
int16x8_t test_vmovl_s8(int8x8_t a) {
  return vmovl_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I]]
int32x4_t test_vmovl_s16(int16x4_t a) {
  return vmovl_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I]]
int64x2_t test_vmovl_s32(int32x2_t a) {
  return vmovl_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 {
// CHECK:   [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I]]
uint16x8_t test_vmovl_u8(uint8x8_t a) {
  return vmovl_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I]]
uint32x4_t test_vmovl_u16(uint16x4_t a) {
  return vmovl_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I]]
uint64x2_t test_vmovl_u32(uint32x2_t a) {
  return vmovl_u32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovl_high_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
int16x8_t test_vmovl_high_s8(int8x16_t a) {
  return vmovl_high_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovl_high_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vmovl_high_s16(int16x8_t a) {
  return vmovl_high_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovl_high_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vmovl_high_s32(int32x4_t a) {
  return vmovl_high_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmovl_high_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
uint16x8_t test_vmovl_high_u8(uint8x16_t a) {
  return vmovl_high_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmovl_high_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vmovl_high_u16(uint16x8_t a) {
  return vmovl_high_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vmovl_high_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vmovl_high_u32(uint32x4_t a) {
  return vmovl_high_u32(a);
}

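// The fixed-point conversions pass the number of fractional bits as the i32
// immediate of the @llvm.aarch64.neon.vcvtfx* intrinsics; these tests use the
// maximum values (31 for 32-bit lanes, 50 for 64-bit lanes).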
   7127 
   7128 // CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
   7129 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   7130 // CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7131 // CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
   7132 // CHECK:   ret <2 x float> [[VCVT_N1]]
   7133 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
   7134   return vcvt_n_f32_s32(a, 31);
   7135 }
   7136 
   7137 // CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
   7138 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   7139 // CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   7140 // CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
   7141 // CHECK:   ret <4 x float> [[VCVT_N1]]
   7142 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
   7143   return vcvtq_n_f32_s32(a, 31);
   7144 }
   7145 
   7146 // CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) #0 {
   7147 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   7148 // CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   7149 // CHECK:   [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
   7150 // CHECK:   ret <2 x double> [[VCVT_N1]]
   7151 float64x2_t test_vcvtq_n_f64_s64(int64x2_t a) {
   7152   return vcvtq_n_f64_s64(a, 50);
   7153 }
   7154 
   7155 // CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
   7156 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   7157 // CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7158 // CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
   7159 // CHECK:   ret <2 x float> [[VCVT_N1]]
   7160 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
   7161   return vcvt_n_f32_u32(a, 31);
   7162 }
   7163 
   7164 // CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
   7165 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   7166 // CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   7167 // CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
   7168 // CHECK:   ret <4 x float> [[VCVT_N1]]
   7169 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
   7170   return vcvtq_n_f32_u32(a, 31);
   7171 }
   7172 
   7173 // CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) #0 {
   7174 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   7175 // CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   7176 // CHECK:   [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
   7177 // CHECK:   ret <2 x double> [[VCVT_N1]]
   7178 float64x2_t test_vcvtq_n_f64_u64(uint64x2_t a) {
   7179   return vcvtq_n_f64_u64(a, 50);
   7180 }
   7181 
   7182 // CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
   7183 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   7184 // CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   7185 // CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
   7186 // CHECK:   ret <2 x i32> [[VCVT_N1]]
   7187 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
   7188   return vcvt_n_s32_f32(a, 31);
   7189 }
   7190 
   7191 // CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
   7192 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   7193 // CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   7194 // CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
   7195 // CHECK:   ret <4 x i32> [[VCVT_N1]]
   7196 int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
   7197   return vcvtq_n_s32_f32(a, 31);
   7198 }
   7199 
   7200 // CHECK-LABEL: define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) #0 {
   7201 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   7202 // CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   7203 // CHECK:   [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
   7204 // CHECK:   ret <2 x i64> [[VCVT_N1]]
   7205 int64x2_t test_vcvtq_n_s64_f64(float64x2_t a) {
   7206   return vcvtq_n_s64_f64(a, 50);
   7207 }
   7208 
   7209 // CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
   7210 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
   7211 // CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
   7212 // CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
   7213 // CHECK:   ret <2 x i32> [[VCVT_N1]]
   7214 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
   7215   return vcvt_n_u32_f32(a, 31);
   7216 }
   7217 
   7218 // CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
   7219 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   7220 // CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
   7221 // CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
   7222 // CHECK:   ret <4 x i32> [[VCVT_N1]]
   7223 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
   7224   return vcvtq_n_u32_f32(a, 31);
   7225 }
   7226 
   7227 // CHECK-LABEL: define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) #0 {
   7228 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   7229 // CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
   7230 // CHECK:   [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
   7231 // CHECK:   ret <2 x i64> [[VCVT_N1]]
   7232 uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) {
   7233   return vcvtq_n_u64_f64(a, 50);
   7234 }
   7235 
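// vaddl/vaddw are widening adds: the narrow operand(s) are extended to the
// wider type and a plain add is emitted, with no target intrinsic involved.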
// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
  return vaddl_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
  return vaddl_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
  return vaddl_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
  return vaddl_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
  return vaddl_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
  return vaddl_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) {
  return vaddl_high_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) {
  return vaddl_high_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) {
  return vaddl_high_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) {
  return vaddl_high_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) {
  return vaddl_high_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) {
  return vaddl_high_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
  return vaddw_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
  return vaddw_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
  return vaddw_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
  return vaddw_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
  return vaddw_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
  return vaddw_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) {
  return vaddw_high_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) {
  return vaddw_high_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) {
  return vaddw_high_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) {
  return vaddw_high_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) {
  return vaddw_high_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) {
  return vaddw_high_u32(a, b);
}

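// The vsubl/vsubw tests mirror the widening-add tests above, with sub in
// place of add.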
   7505 
   7506 // CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   7507 // CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
   7508 // CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
   7509 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
   7510 // CHECK:   ret <8 x i16> [[SUB_I]]
   7511 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
   7512   return vsubl_s8(a, b);
   7513 }
   7514 
   7515 // CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   7516 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   7517 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   7518 // CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
   7519 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   7520 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   7521 // CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
   7522 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
   7523 // CHECK:   ret <4 x i32> [[SUB_I]]
   7524 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
   7525   return vsubl_s16(a, b);
   7526 }
   7527 
   7528 // CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   7529 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   7530 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7531 // CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
   7532 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   7533 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   7534 // CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
   7535 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
   7536 // CHECK:   ret <2 x i64> [[SUB_I]]
   7537 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
   7538   return vsubl_s32(a, b);
   7539 }
   7540 
   7541 // CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   7542 // CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
   7543 // CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
   7544 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
   7545 // CHECK:   ret <8 x i16> [[SUB_I]]
   7546 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
   7547   return vsubl_u8(a, b);
   7548 }
   7549 
   7550 // CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   7551 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   7552 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   7553 // CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
   7554 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   7555 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   7556 // CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
   7557 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
   7558 // CHECK:   ret <4 x i32> [[SUB_I]]
   7559 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
   7560   return vsubl_u16(a, b);
   7561 }
   7562 
   7563 // CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   7564 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   7565 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7566 // CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
   7567 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   7568 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   7569 // CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
   7570 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
   7571 // CHECK:   ret <2 x i64> [[SUB_I]]
   7572 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
   7573   return vsubl_u32(a, b);
   7574 }
   7575 
   7576 // CHECK-LABEL: define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
   7577 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   7578 // CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
   7579 // CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   7580 // CHECK:   [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
   7581 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
   7582 // CHECK:   ret <8 x i16> [[SUB_I]]
   7583 int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) {
   7584   return vsubl_high_s8(a, b);
   7585 }
   7586 
   7587 // CHECK-LABEL: define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   7588 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7589 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
   7590 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   7591 // CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
   7592 // CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7593 // CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
   7594 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
   7595 // CHECK:   [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
   7596 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]]
   7597 // CHECK:   ret <4 x i32> [[SUB_I]]
   7598 int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) {
   7599   return vsubl_high_s16(a, b);
   7600 }
   7601 
   7602 // CHECK-LABEL: define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   7603 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   7604 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
   7605 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7606 // CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
   7607 // CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   7608 // CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
   7609 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
   7610 // CHECK:   [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64>
   7611 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]]
   7612 // CHECK:   ret <2 x i64> [[SUB_I]]
   7613 int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) {
   7614   return vsubl_high_s32(a, b);
   7615 }
   7616 
   7617 // CHECK-LABEL: define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
   7618 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   7619 // CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
   7620 // CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   7621 // CHECK:   [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
   7622 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
   7623 // CHECK:   ret <8 x i16> [[SUB_I]]
   7624 uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) {
   7625   return vsubl_high_u8(a, b);
   7626 }
   7627 
   7628 // CHECK-LABEL: define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   7629 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7630 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
   7631 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   7632 // CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
   7633 // CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7634 // CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
   7635 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
   7636 // CHECK:   [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
   7637 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]]
   7638 // CHECK:   ret <4 x i32> [[SUB_I]]
   7639 uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) {
   7640   return vsubl_high_u16(a, b);
   7641 }
   7642 
   7643 // CHECK-LABEL: define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   7644 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   7645 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
   7646 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7647 // CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
   7648 // CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   7649 // CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
   7650 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
   7651 // CHECK:   [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
   7652 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]]
   7653 // CHECK:   ret <2 x i64> [[SUB_I]]
   7654 uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) {
   7655   return vsubl_high_u32(a, b);
   7656 }
   7657 
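// The vsubw_<t> ("subtract wide") tests below check that the narrow operand
// is widened (sext/zext) and then subtracted lane-wise from the wide operand.
// A scalar sketch of the s8 lane semantics matching the IR shown below
// (ref_vsubw_s8_lane is illustrative only and not part of the checked output):
static int16_t ref_vsubw_s8_lane(int16_t a, int8_t b) {
  return (int16_t)(a - (int16_t)b); // sign-extend b, then subtract
}
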
   7658 // CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
   7659 // CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
   7660 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
   7661 // CHECK:   ret <8 x i16> [[SUB_I]]
   7662 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
   7663   return vsubw_s8(a, b);
   7664 }
   7665 
   7666 // CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
   7667 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   7668 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   7669 // CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
   7670 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
   7671 // CHECK:   ret <4 x i32> [[SUB_I]]
   7672 int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
   7673   return vsubw_s16(a, b);
   7674 }
   7675 
   7676 // CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
   7677 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   7678 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7679 // CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
   7680 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
   7681 // CHECK:   ret <2 x i64> [[SUB_I]]
   7682 int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
   7683   return vsubw_s32(a, b);
   7684 }
   7685 
   7686 // CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
   7687 // CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
   7688 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
   7689 // CHECK:   ret <8 x i16> [[SUB_I]]
   7690 uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
   7691   return vsubw_u8(a, b);
   7692 }
   7693 
   7694 // CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
   7695 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   7696 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   7697 // CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
   7698 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
   7699 // CHECK:   ret <4 x i32> [[SUB_I]]
   7700 uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
   7701   return vsubw_u16(a, b);
   7702 }
   7703 
   7704 // CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
   7705 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   7706 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7707 // CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
   7708 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
   7709 // CHECK:   ret <2 x i64> [[SUB_I]]
   7710 uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
   7711   return vsubw_u32(a, b);
   7712 }
   7713 
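// vsubw_high_<t> is the same widening subtract, but the narrow operand comes
// from the high half of a 128-bit vector (the shufflevector of lanes
// 8..15 / 4..7 / 2..3 below). An illustrative scalar loop for the s8 case
// (ref_vsubw_high_s8 is a sketch, not part of the checked output):
static void ref_vsubw_high_s8(int16_t dst[8], const int16_t a[8],
                              const int8_t b[16]) {
  for (int i = 0; i < 8; ++i)
    dst[i] = (int16_t)(a[i] - (int16_t)b[i + 8]); // only the top half of b
}
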
   7714 // CHECK-LABEL: define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 {
   7715 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   7716 // CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
   7717 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
   7718 // CHECK:   ret <8 x i16> [[SUB_I]]
   7719 int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) {
   7720   return vsubw_high_s8(a, b);
   7721 }
   7722 
   7723 // CHECK-LABEL: define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 {
   7724 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7725 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
   7726 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   7727 // CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
   7728 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]]
   7729 // CHECK:   ret <4 x i32> [[SUB_I]]
   7730 int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) {
   7731   return vsubw_high_s16(a, b);
   7732 }
   7733 
   7734 // CHECK-LABEL: define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 {
   7735 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   7736 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
   7737 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7738 // CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
   7739 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]]
   7740 // CHECK:   ret <2 x i64> [[SUB_I]]
   7741 int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) {
   7742   return vsubw_high_s32(a, b);
   7743 }
   7744 
   7745 // CHECK-LABEL: define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 {
   7746 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   7747 // CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
   7748 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
   7749 // CHECK:   ret <8 x i16> [[SUB_I]]
   7750 uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) {
   7751   return vsubw_high_u8(a, b);
   7752 }
   7753 
   7754 // CHECK-LABEL: define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 {
   7755 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   7756 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
   7757 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   7758 // CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
   7759 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]]
   7760 // CHECK:   ret <4 x i32> [[SUB_I]]
   7761 uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) {
   7762   return vsubw_high_u16(a, b);
   7763 }
   7764 
   7765 // CHECK-LABEL: define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 {
   7766 // CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
   7767 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
   7768 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   7769 // CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
   7770 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]]
   7771 // CHECK:   ret <2 x i64> [[SUB_I]]
   7772 uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) {
   7773   return vsubw_high_u32(a, b);
   7774 }
   7775 
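// vaddhn_<t> ("add returning high narrow") adds full-width lanes and keeps
// the high half of each sum: the add / lshr / trunc sequence checked below.
// Scalar sketch for the s16 case (ref_vaddhn_s16_lane is illustrative only):
static int8_t ref_vaddhn_s16_lane(int16_t a, int16_t b) {
  return (int8_t)((uint16_t)(a + b) >> 8); // high byte of the 16-bit sum
}
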
   7776 // CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   7777 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   7778 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   7779 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   7780 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   7781 // CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
   7782 // CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   7783 // CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
   7784 // CHECK:   ret <8 x i8> [[VADDHN2_I]]
   7785 int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
   7786   return vaddhn_s16(a, b);
   7787 }
   7788 
   7789 // CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   7790 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   7791 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   7792 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   7793 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   7794 // CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
   7795 // CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
   7796 // CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
   7797 // CHECK:   ret <4 x i16> [[VADDHN2_I]]
   7798 int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
   7799   return vaddhn_s32(a, b);
   7800 }
   7801 
   7802 // CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
   7803 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   7804 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   7805 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   7806 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   7807 // CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
   7808 // CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
   7809 // CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
   7810 // CHECK:   ret <2 x i32> [[VADDHN2_I]]
   7811 int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
   7812   return vaddhn_s64(a, b);
   7813 }
   7814 
   7815 // CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   7816 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   7817 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   7818 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   7819 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   7820 // CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
   7821 // CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   7822 // CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
   7823 // CHECK:   ret <8 x i8> [[VADDHN2_I]]
   7824 uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
   7825   return vaddhn_u16(a, b);
   7826 }
   7827 
   7828 // CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   7829 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   7830 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   7831 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   7832 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   7833 // CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
   7834 // CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
   7835 // CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
   7836 // CHECK:   ret <4 x i16> [[VADDHN2_I]]
   7837 uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
   7838   return vaddhn_u32(a, b);
   7839 }
   7840 
   7841 // CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
   7842 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   7843 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   7844 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   7845 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   7846 // CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
   7847 // CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
   7848 // CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
   7849 // CHECK:   ret <2 x i32> [[VADDHN2_I]]
   7850 uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
   7851   return vaddhn_u64(a, b);
   7852 }
   7853 
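// The _high variants write the narrowed result into the top half of a 128-bit
// vector whose bottom half is %r, i.e. the trailing shufflevector below is a
// concatenation. Illustrative loop for the s16 case (a sketch only, not part
// of the checked output):
static void ref_vaddhn_high_s16(int8_t dst[16], const int8_t r[8],
                                const int16_t a[8], const int16_t b[8]) {
  for (int i = 0; i < 8; ++i) {
    dst[i] = r[i];                                        // low half: r
    dst[8 + i] = (int8_t)((uint16_t)(a[i] + b[i]) >> 8);  // high half: vaddhn
  }
}
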
   7854 // CHECK-LABEL: define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
   7855 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   7856 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   7857 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   7858 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   7859 // CHECK:   [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
   7860 // CHECK:   [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   7861 // CHECK:   [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
   7862 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   7863 // CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
   7864 int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
   7865   return vaddhn_high_s16(r, a, b);
   7866 }
   7867 
   7868 // CHECK-LABEL: define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
   7869 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   7870 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   7871 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   7872 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   7873 // CHECK:   [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
   7874 // CHECK:   [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
   7875 // CHECK:   [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
   7876 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   7877 // CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
   7878 int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
   7879   return vaddhn_high_s32(r, a, b);
   7880 }
   7881 
   7882 // CHECK-LABEL: define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
   7883 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   7884 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   7885 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   7886 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   7887 // CHECK:   [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
   7888 // CHECK:   [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32>
   7889 // CHECK:   [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
   7890 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7891 // CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
   7892 int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
   7893   return vaddhn_high_s64(r, a, b);
   7894 }
   7895 
   7896 // CHECK-LABEL: define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
   7897 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   7898 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   7899 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   7900 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   7901 // CHECK:   [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
   7902 // CHECK:   [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   7903 // CHECK:   [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
   7904 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   7905 // CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
   7906 uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
   7907   return vaddhn_high_u16(r, a, b);
   7908 }
   7909 
   7910 // CHECK-LABEL: define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
   7911 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   7912 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   7913 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   7914 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   7915 // CHECK:   [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
   7916 // CHECK:   [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
   7917 // CHECK:   [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
   7918 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   7919 // CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
   7920 uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
   7921   return vaddhn_high_u32(r, a, b);
   7922 }
   7923 
   7924 // CHECK-LABEL: define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
   7925 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   7926 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   7927 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   7928 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   7929 // CHECK:   [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
   7930 // CHECK:   [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32>
   7931 // CHECK:   [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
   7932 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   7933 // CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
   7934 uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
   7935   return vaddhn_high_u64(r, a, b);
   7936 }
   7937 
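// vraddhn_<t> is the rounding form of vaddhn: half of the discarded bit-range
// is added before the shift, so the narrow result is rounded to nearest
// rather than truncated; it lowers to @llvm.aarch64.neon.raddhn.* below.
// Scalar sketch for the s16 case, assuming the architectural RADDHN rounding
// constant of 1 << 7 (ref_vraddhn_s16_lane is illustrative only):
static int8_t ref_vraddhn_s16_lane(int16_t a, int16_t b) {
  return (int8_t)((uint16_t)(a + b + (1 << 7)) >> 8); // round, take high byte
}
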
   7938 // CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   7939 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   7940 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   7941 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   7942 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   7943 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
   7944 // CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
   7945 int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
   7946   return vraddhn_s16(a, b);
   7947 }
   7948 
   7949 // CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   7950 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   7951 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   7952 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   7953 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   7954 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
   7955 // CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
   7956 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
   7957 // CHECK:   ret <4 x i16> [[TMP2]]
   7958 int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
   7959   return vraddhn_s32(a, b);
   7960 }
   7961 
   7962 // CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
   7963 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   7964 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   7965 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   7966 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   7967 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
   7968 // CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
   7969 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
   7970 // CHECK:   ret <2 x i32> [[TMP2]]
   7971 int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
   7972   return vraddhn_s64(a, b);
   7973 }
   7974 
   7975 // CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   7976 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   7977 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   7978 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   7979 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   7980 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
   7981 // CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
   7982 uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
   7983   return vraddhn_u16(a, b);
   7984 }
   7985 
   7986 // CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   7987 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   7988 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   7989 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   7990 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   7991 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
   7992 // CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
   7993 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
   7994 // CHECK:   ret <4 x i16> [[TMP2]]
   7995 uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
   7996   return vraddhn_u32(a, b);
   7997 }
   7998 
   7999 // CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
   8000 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8001 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8002 // CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8003 // CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8004 // CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
   8005 // CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
   8006 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
   8007 // CHECK:   ret <2 x i32> [[TMP2]]
   8008 uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
   8009   return vraddhn_u64(a, b);
   8010 }
   8011 
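// As with vaddhn_high, the vraddhn_high tests check that the rounded narrow
// result lands in the top half of the 128-bit destination, concatenated with
// %r by the trailing shufflevector.
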
   8012 // CHECK-LABEL: define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
   8013 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8014 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8015 // CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8016 // CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8017 // CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4
   8018 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   8019 // CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
   8020 int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
   8021   return vraddhn_high_s16(r, a, b);
   8022 }
   8023 
   8024 // CHECK-LABEL: define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
   8025 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8026 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8027 // CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8028 // CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8029 // CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4
   8030 // CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
   8031 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16>
   8032 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   8033 // CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
   8034 int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
   8035   return vraddhn_high_s32(r, a, b);
   8036 }
   8037 
   8038 // CHECK-LABEL: define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
   8039 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8040 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8041 // CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8042 // CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8043 // CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4
   8044 // CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
   8045 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32>
   8046 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8047 // CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
   8048 int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
   8049   return vraddhn_high_s64(r, a, b);
   8050 }
   8051 
   8052 // CHECK-LABEL: define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
   8053 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8054 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8055 // CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8056 // CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8057 // CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4
   8058 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   8059 // CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
   8060 uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
   8061   return vraddhn_high_u16(r, a, b);
   8062 }
   8063 
   8064 // CHECK-LABEL: define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
   8065 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8066 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8067 // CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8068 // CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8069 // CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4
   8070 // CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
   8071 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16>
   8072 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   8073 // CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
   8074 uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
   8075   return vraddhn_high_u32(r, a, b);
   8076 }
   8077 
   8078 // CHECK-LABEL: define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
   8079 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8080 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8081 // CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8082 // CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8083 // CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4
   8084 // CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
   8085 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32>
   8086 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8087 // CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
   8088 uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
   8089   return vraddhn_high_u64(r, a, b);
   8090 }
   8091 
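// vsubhn_<t> mirrors vaddhn with a subtraction: sub / lshr / trunc, keeping
// the high half of each lane-wise difference. Scalar sketch for the s16 case
// (ref_vsubhn_s16_lane is illustrative only, not part of the checked output):
static int8_t ref_vsubhn_s16_lane(int16_t a, int16_t b) {
  return (int8_t)((uint16_t)(a - b) >> 8); // high byte of the 16-bit difference
}
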
   8092 // CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   8093 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8094 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8095 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8096 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8097 // CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
   8098 // CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   8099 // CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
   8100 // CHECK:   ret <8 x i8> [[VSUBHN2_I]]
   8101 int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
   8102   return vsubhn_s16(a, b);
   8103 }
   8104 
   8105 // CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   8106 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8107 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8108 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8109 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8110 // CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
   8111 // CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
   8112 // CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
   8113 // CHECK:   ret <4 x i16> [[VSUBHN2_I]]
   8114 int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
   8115   return vsubhn_s32(a, b);
   8116 }
   8117 
   8118 // CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
   8119 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8120 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8121 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8122 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8123 // CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
   8124 // CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
   8125 // CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
   8126 // CHECK:   ret <2 x i32> [[VSUBHN2_I]]
   8127 int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
   8128   return vsubhn_s64(a, b);
   8129 }
   8130 
   8131 // CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   8132 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8133 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8134 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8135 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8136 // CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
   8137 // CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   8138 // CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
   8139 // CHECK:   ret <8 x i8> [[VSUBHN2_I]]
   8140 uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
   8141   return vsubhn_u16(a, b);
   8142 }
   8143 
   8144 // CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   8145 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8146 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8147 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8148 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8149 // CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
   8150 // CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
   8151 // CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
   8152 // CHECK:   ret <4 x i16> [[VSUBHN2_I]]
   8153 uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
   8154   return vsubhn_u32(a, b);
   8155 }
   8156 
   8157 // CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
   8158 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8159 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8160 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8161 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8162 // CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
   8163 // CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
   8164 // CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
   8165 // CHECK:   ret <2 x i32> [[VSUBHN2_I]]
   8166 uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
   8167   return vsubhn_u64(a, b);
   8168 }
   8169 
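// The vsubhn_high tests repeat the sub / lshr / trunc pattern and then
// concatenate with %r, exactly as in the vaddhn_high tests above.
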
   8170 // CHECK-LABEL: define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
   8171 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8172 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8173 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8174 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8175 // CHECK:   [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
   8176 // CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   8177 // CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
   8178 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   8179 // CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
   8180 int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
   8181   return vsubhn_high_s16(r, a, b);
   8182 }
   8183 
   8184 // CHECK-LABEL: define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
   8185 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8186 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8187 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8188 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8189 // CHECK:   [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
   8190 // CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
   8191 // CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
   8192 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   8193 // CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
   8194 int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
   8195   return vsubhn_high_s32(r, a, b);
   8196 }
   8197 
   8198 // CHECK-LABEL: define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
   8199 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8200 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8201 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8202 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8203 // CHECK:   [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
   8204 // CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32>
   8205 // CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
   8206 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8207 // CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
   8208 int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
   8209   return vsubhn_high_s64(r, a, b);
   8210 }
   8211 
   8212 // CHECK-LABEL: define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
   8213 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8214 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8215 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8216 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8217 // CHECK:   [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
   8218 // CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   8219 // CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
   8220 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   8221 // CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
   8222 uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
   8223   return vsubhn_high_u16(r, a, b);
   8224 }
   8225 
   8226 // CHECK-LABEL: define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
   8227 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8228 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8229 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8230 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8231 // CHECK:   [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
   8232 // CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
   8233 // CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
   8234 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   8235 // CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
   8236 uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
   8237   return vsubhn_high_u32(r, a, b);
   8238 }
   8239 
   8240 // CHECK-LABEL: define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
   8241 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8242 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8243 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8244 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8245 // CHECK:   [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
   8246 // CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32>
   8247 // CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
   8248 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8249 // CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
   8250 uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
   8251   return vsubhn_high_u64(r, a, b);
   8252 }
   8253 
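// vrsubhn_<t> is the rounding subtract-narrow, lowered to
// @llvm.aarch64.neon.rsubhn.* below. Scalar sketch for the s16 case, assuming
// the same 1 << 7 rounding constant as RADDHN (ref_vrsubhn_s16_lane is
// illustrative only):
static int8_t ref_vrsubhn_s16_lane(int16_t a, int16_t b) {
  return (int8_t)((uint16_t)(a - b + (1 << 7)) >> 8); // rounded high byte
}
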
   8254 // CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   8255 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8256 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8257 // CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8258 // CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8259 // CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
   8260 // CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
   8261 int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
   8262   return vrsubhn_s16(a, b);
   8263 }
   8264 
   8265 // CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   8266 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8267 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8268 // CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8269 // CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8270 // CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
   8271 // CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
   8272 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
   8273 // CHECK:   ret <4 x i16> [[TMP2]]
   8274 int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
   8275   return vrsubhn_s32(a, b);
   8276 }
   8277 
   8278 // CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
   8279 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8280 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8281 // CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8282 // CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8283 // CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
   8284 // CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
   8285 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
   8286 // CHECK:   ret <2 x i32> [[TMP2]]
   8287 int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
   8288   return vrsubhn_s64(a, b);
   8289 }
   8290 
   8291 // CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
   8292 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8293 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8294 // CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8295 // CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8296 // CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
   8297 // CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
   8298 uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
   8299   return vrsubhn_u16(a, b);
   8300 }
   8301 
   8302 // CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
   8303 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8304 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8305 // CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8306 // CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8307 // CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
   8308 // CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
   8309 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
   8310 // CHECK:   ret <4 x i16> [[TMP2]]
   8311 uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
   8312   return vrsubhn_u32(a, b);
   8313 }
   8314 
   8315 // CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
   8316 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8317 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8318 // CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8319 // CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8320 // CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
   8321 // CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
   8322 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
   8323 // CHECK:   ret <2 x i32> [[TMP2]]
   8324 uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
   8325   return vrsubhn_u64(a, b);
   8326 }
   8327 
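// vrsubhn_high: the rounded narrow difference goes into the top half,
// concatenated with %r by the trailing shufflevector, as in the other _high
// tests.
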
   8328 // CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
   8329 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8330 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8331 // CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8332 // CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8333 // CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4
   8334 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   8335 // CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
   8336 int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
   8337   return vrsubhn_high_s16(r, a, b);
   8338 }
   8339 
   8340 // CHECK-LABEL: define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
   8341 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8342 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8343 // CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8344 // CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8345 // CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4
   8346 // CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
   8347 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16>
   8348 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   8349 // CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
   8350 int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
   8351   return vrsubhn_high_s32(r, a, b);
   8352 }
   8353 
   8354 // CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
   8355 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8356 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8357 // CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8358 // CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8359 // CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4
   8360 // CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
   8361 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32>
   8362 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8363 // CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
   8364 int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
   8365   return vrsubhn_high_s64(r, a, b);
   8366 }
   8367 
   8368 // CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
   8369 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   8370 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
   8371 // CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   8372 // CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
   8373 // CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4
   8374 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   8375 // CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
   8376 uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
   8377   return vrsubhn_high_u16(r, a, b);
   8378 }
   8379 
   8380 // CHECK-LABEL: define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
   8381 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   8382 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
   8383 // CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   8384 // CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
   8385 // CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4
   8386 // CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
   8387 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16>
   8388 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   8389 // CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
   8390 uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
   8391   return vrsubhn_high_u32(r, a, b);
   8392 }
   8393 
   8394 // CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
   8395 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   8396 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
   8397 // CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   8398 // CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
   8399 // CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4
   8400 // CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
   8401 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32>
   8402 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   8403 // CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
   8404 uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
   8405   return vrsubhn_high_u64(r, a, b);
   8406 }
   8407 
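// vabdl_<t> ("absolute difference long") computes |a - b| per lane via
// @llvm.aarch64.neon.[su]abd.* and widens it; note the widening is a zext
// even for signed inputs, because the absolute difference is non-negative.
// Scalar sketch for the s8 case (ref_vabdl_s8_lane is illustrative only):
static int16_t ref_vabdl_s8_lane(int8_t a, int8_t b) {
  int d = a - b;                    // exact in int, no overflow
  return (int16_t)(d < 0 ? -d : d); // widened absolute difference
}
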
   8408 // CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
   8409 // CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   8410 // CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
   8411 // CHECK:   ret <8 x i16> [[VMOVL_I_I]]
   8412 int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
   8413   return vabdl_s8(a, b);
   8414 }
   8415 // CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
   8416 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   8417 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   8418 // CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   8419 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   8420 // CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
   8421 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
   8422 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   8423 // CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
   8424 // CHECK:   ret <4 x i32> [[VMOVL_I_I]]
   8425 int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
   8426   return vabdl_s16(a, b);
   8427 }
   8428 // CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
   8429 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   8430 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   8431 // CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   8432 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   8433 // CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
   8434 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
   8435 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   8436 // CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
   8437 // CHECK:   ret <2 x i64> [[VMOVL_I_I]]
   8438 int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
   8439   return vabdl_s32(a, b);
   8440 }
   8441 // CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
   8442 // CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
   8443 // CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
   8444 // CHECK:   ret <8 x i16> [[VMOVL_I_I]]
   8445 uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
   8446   return vabdl_u8(a, b);
   8447 }
   8448 // CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
   8449 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   8450 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
   8451 // CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   8452 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   8453 // CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
   8454 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
   8455 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
   8456 // CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
   8457 // CHECK:   ret <4 x i32> [[VMOVL_I_I]]
   8458 uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
   8459   return vabdl_u16(a, b);
   8460 }
   8461 // CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
   8462 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   8463 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
   8464 // CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   8465 // CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
   8466 // CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
   8467 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
   8468 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
   8469 // CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
   8470 // CHECK:   ret <2 x i64> [[VMOVL_I_I]]
   8471 uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
   8472   return vabdl_u32(a, b);
   8473 }
   8474 
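// vabal_<t> accumulates the widened absolute difference into %a:
// a + zext(|b - c|), i.e. vabdl followed by an add. Scalar sketch for the s8
// case (ref_vabal_s8_lane is illustrative only, not part of the checked
// output):
static int16_t ref_vabal_s8_lane(int16_t acc, int8_t b, int8_t c) {
  int d = b - c;
  return (int16_t)(acc + (d < 0 ? -d : d)); // accumulate |b - c|
}
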
// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vabal_s8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vabal_s16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vabal_s32(a, b, c);
}
// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vabal_u8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vabal_u16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vabal_u32(a, b, c);
}

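// Illustrative sketch (hypothetical helper, not covered by the CHECK lines):
// vabal_u8 accumulates widened absolute differences, acc[i] += |b[i] - c[i]|,
// which is the core step of a sum-of-absolute-differences (SAD) kernel:
static inline uint16x8_t sad_row_u8(const uint8_t *p, const uint8_t *q, int n) {
  uint16x8_t acc = vdupq_n_u16(0);
  for (int i = 0; i + 8 <= n; i += 8)  // any tail elements are left to the caller
    acc = vabal_u8(acc, vld1_u8(p + i), vld1_u8(q + i));
  return acc;
}
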
// CHECK-LABEL: define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I_I_I]]
int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) {
  return vabdl_high_s8(a, b);
}
// CHECK-LABEL: define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I_I_I]]
int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) {
  return vabdl_high_s16(a, b);
}
// CHECK-LABEL: define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I_I_I]]
int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) {
  return vabdl_high_s32(a, b);
}
// CHECK-LABEL: define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I_I_I]]
uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) {
  return vabdl_high_u8(a, b);
}
// CHECK-LABEL: define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I_I_I]]
uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) {
  return vabdl_high_u16(a, b);
}
// CHECK-LABEL: define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I_I_I]]
uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) {
  return vabdl_high_u32(a, b);
}

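// Illustrative sketch: the _high variants read the upper halves of their
// 128-bit inputs, so vabdl_high_u8(a, b) should be equivalent to the explicit
// vget_high_u8 decomposition (hypothetical helper, no CHECK lines reference it):
static inline uint16x8_t vabdl_high_u8_equiv(uint8x16_t a, uint8x16_t b) {
  return vabdl_u8(vget_high_u8(a), vget_high_u8(b));
}
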
// CHECK-LABEL: define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I_I]]
int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
  return vabal_high_s8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I_I]]
int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
  return vabal_high_s16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I_I]]
int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
  return vabal_high_s32(a, b, c);
}
// CHECK-LABEL: define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I_I]]
uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
  return vabal_high_u8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I_I]]
uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
  return vabal_high_u16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I_I]]
uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
  return vabal_high_u32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
  return vmull_s8(a, b);
}
// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
  return vmull_s16(a, b);
}
// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
  return vmull_s32(a, b);
}
// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
  return vmull_u8(a, b);
}
// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_u16(a, b);
}
// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_u32(a, b);
}

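// Illustrative sketch (hypothetical helper): vmull_u8 yields full 16-bit
// products of 8-bit lanes, so weighting bytes cannot overflow before the
// final narrowing shift:
static inline uint8x8_t scale_u8(uint8x8_t px, uint8x8_t w) {
  uint16x8_t prod = vmull_u8(px, w);  // 16-bit intermediate products
  return vshrn_n_u16(prod, 8);        // narrow back, dividing by 256
}
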
// CHECK-LABEL: define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   ret <8 x i16> [[VMULL_I_I]]
int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) {
  return vmull_high_s8(a, b);
}
// CHECK-LABEL: define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I_I]]
int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) {
  return vmull_high_s16(a, b);
}
// CHECK-LABEL: define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I_I]]
int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) {
  return vmull_high_s32(a, b);
}
// CHECK-LABEL: define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   ret <8 x i16> [[VMULL_I_I]]
uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) {
  return vmull_high_u8(a, b);
}
// CHECK-LABEL: define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I_I]]
uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) {
  return vmull_high_u16(a, b);
}
// CHECK-LABEL: define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I_I]]
uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) {
  return vmull_high_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlal_s8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlal_s16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlal_s32(a, b, c);
}
// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlal_u8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlal_u16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlal_u32(a, b, c);
}

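// Illustrative sketch: vmlal_s16 widens the products before accumulating, so
// a 16-bit dot product runs in 32-bit lanes without overflow; vaddvq_s32 (an
// AArch64 across-vector reduction) collapses the accumulator. Hypothetical helper:
static inline int32_t dot_s16(const int16_t *x, const int16_t *y, int n) {
  int32x4_t acc = vdupq_n_s32(0);
  for (int i = 0; i + 4 <= n; i += 4)  // any tail elements are left to the caller
    acc = vmlal_s16(acc, vld1_s16(x + i), vld1_s16(y + i));
  return vaddvq_s32(acc);
}
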
// CHECK-LABEL: define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I_I]]
int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
  return vmlal_high_s8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I_I]]
int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
  return vmlal_high_s16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I_I]]
int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
  return vmlal_high_s32(a, b, c);
}
// CHECK-LABEL: define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I_I]]
uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
  return vmlal_high_u8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I_I]]
uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
  return vmlal_high_u16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I_I]]
uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
  return vmlal_high_u32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlsl_s8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_s16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_s32(a, b, c);
}
// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlsl_u8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_u16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_u32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I_I]]
int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
  return vmlsl_high_s8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I_I]]
int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
  return vmlsl_high_s16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I_I]]
int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
  return vmlsl_high_s32(a, b, c);
}
// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I_I]]
uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
  return vmlsl_high_u8(a, b, c);
}
// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I_I]]
uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
  return vmlsl_high_u16(a, b, c);
}
// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I_I]]
uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
  return vmlsl_high_u32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
  return vqdmull_s16(a, b);
}
// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
  return vqdmull_s32(a, b);
}

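// Illustrative sketch: vqdmull_s16 is the widening Q15 multiply,
// saturating(2 * (int32)a[i] * (int32)b[i]); the doubling can only saturate
// when both inputs are INT16_MIN. A hypothetical scalar reference for one lane:
static inline int32_t scalar_vqdmull_s16_lane(int16_t x, int16_t y) {
  long long p = 2LL * x * y;
  return (int32_t)(p > 0x7fffffff ? 0x7fffffff : p);  // only -32768 * -32768 saturates
}
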
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlal_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlal_s32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_s32(a, b, c);
}

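// Illustrative sketch: vqdmlal_s16/vqdmlsl_s16 fuse the widening Q15 multiply
// with a saturating accumulate, matching the sqdmull-then-sqadd/sqsub pattern
// checked above; equivalently (hypothetical helper):
static inline int32x4_t vqdmlal_s16_equiv(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqaddq_s32(a, vqdmull_s16(b, c));
}
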
   9171 // CHECK-LABEL: define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
   9172 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   9173 // CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   9174 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
   9175 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
   9176 // CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   9177 // CHECK:   [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
   9178 // CHECK:   [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V1_I_I]]) #4
   9179 // CHECK:   [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
   9180 // CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <4 x i32>
   9181 // CHECK:   ret <4 x i32> [[TMP2]]
   9182 int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) {
   9183   return vqdmull_high_s16(a, b);
   9184 }
   9185 // CHECK-LABEL: define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
   9186 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   9187 // CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V1_I_I]]) #4
// CHECK:   [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) {
  return vqdmull_high_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4
// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I_I]]
int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
  return vqdmlal_high_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4
// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I_I]]
int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
  return vqdmlal_high_s32(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4
// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4
// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I_I]]
int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
  return vqdmlsl_high_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4
// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4
// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I_I]]
int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
  return vqdmlsl_high_s32(a, b, c);
}

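// Polynomial multiply: vmull_p8 maps straight onto the pmull intrinsic; the _high
// variant first extracts the upper halves of its operands with a shufflevector.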
// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
  return vmull_p8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
// CHECK:   ret <8 x i16> [[VMULL_I_I]]
poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) {
  return vmull_high_p8(a, b);
}

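// Scalar (d-register) add and subtract lower to plain i64 add/sub instructions.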
// CHECK-LABEL: define i64 @test_vaddd_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[VADDD_I:%.*]] = add i64 %a, %b
// CHECK:   ret i64 [[VADDD_I]]
int64_t test_vaddd_s64(int64_t a, int64_t b) {
  return vaddd_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vaddd_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[VADDD_I:%.*]] = add i64 %a, %b
// CHECK:   ret i64 [[VADDD_I]]
uint64_t test_vaddd_u64(uint64_t a, uint64_t b) {
  return vaddd_u64(a, b);
}

// CHECK-LABEL: define i64 @test_vsubd_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[VSUBD_I:%.*]] = sub i64 %a, %b
// CHECK:   ret i64 [[VSUBD_I]]
int64_t test_vsubd_s64(int64_t a, int64_t b) {
  return vsubd_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vsubd_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[VSUBD_I:%.*]] = sub i64 %a, %b
// CHECK:   ret i64 [[VSUBD_I]]
uint64_t test_vsubd_u64(uint64_t a, uint64_t b) {
  return vsubd_u64(a, b);
}

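// Scalar saturating add/sub: the i8/i16 variants have no scalar intrinsic, so the
// operands are inserted into lane 0 of a vector, the vector intrinsic is called,
// and lane 0 of the result is extracted; the i32/i64 variants call scalar intrinsics.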
// CHECK-LABEL: define i8 @test_vqaddb_s8(i8 %a, i8 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK:   [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0
// CHECK:   ret i8 [[TMP2]]
int8_t test_vqaddb_s8(int8_t a, int8_t b) {
  return vqaddb_s8(a, b);
}

// CHECK-LABEL: define i16 @test_vqaddh_s16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
int16_t test_vqaddh_s16(int16_t a, int16_t b) {
  return vqaddh_s16(a, b);
}

// CHECK-LABEL: define i32 @test_vqadds_s32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQADDS_S32_I]]
int32_t test_vqadds_s32(int32_t a, int32_t b) {
  return vqadds_s32(a, b);
}

// CHECK-LABEL: define i64 @test_vqaddd_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VQADDD_S64_I]]
int64_t test_vqaddd_s64(int64_t a, int64_t b) {
  return vqaddd_s64(a, b);
}

// CHECK-LABEL: define i8 @test_vqaddb_u8(i8 %a, i8 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK:   [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0
// CHECK:   ret i8 [[TMP2]]
uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) {
  return vqaddb_u8(a, b);
}

// CHECK-LABEL: define i16 @test_vqaddh_u16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) {
  return vqaddh_u16(a, b);
}

// CHECK-LABEL: define i32 @test_vqadds_u32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQADDS_U32_I]]
uint32_t test_vqadds_u32(uint32_t a, uint32_t b) {
  return vqadds_u32(a, b);
}

// CHECK-LABEL: define i64 @test_vqaddd_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VQADDD_U64_I]]
uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) {
  return vqaddd_u64(a, b);
}

// CHECK-LABEL: define i8 @test_vqsubb_s8(i8 %a, i8 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK:   [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0
// CHECK:   ret i8 [[TMP2]]
int8_t test_vqsubb_s8(int8_t a, int8_t b) {
  return vqsubb_s8(a, b);
}

// CHECK-LABEL: define i16 @test_vqsubh_s16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
int16_t test_vqsubh_s16(int16_t a, int16_t b) {
  return vqsubh_s16(a, b);
}

// CHECK-LABEL: define i32 @test_vqsubs_s32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQSUBS_S32_I]]
int32_t test_vqsubs_s32(int32_t a, int32_t b) {
  return vqsubs_s32(a, b);
}

// CHECK-LABEL: define i64 @test_vqsubd_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VQSUBD_S64_I]]
int64_t test_vqsubd_s64(int64_t a, int64_t b) {
  return vqsubd_s64(a, b);
}

// CHECK-LABEL: define i8 @test_vqsubb_u8(i8 %a, i8 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK:   [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0
// CHECK:   ret i8 [[TMP2]]
uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) {
  return vqsubb_u8(a, b);
}

// CHECK-LABEL: define i16 @test_vqsubh_u16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) {
  return vqsubh_u16(a, b);
}

// CHECK-LABEL: define i32 @test_vqsubs_u32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQSUBS_U32_I]]
uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) {
  return vqsubs_u32(a, b);
}

// CHECK-LABEL: define i64 @test_vqsubd_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VQSUBD_U64_I]]
uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) {
  return vqsubd_u64(a, b);
}

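// Scalar shifts follow the same pattern: the i64 forms call the scalar sshl/ushl
// intrinsics (and their saturating/rounding variants), while the i8/i16 saturating
// forms go through lane 0 of the corresponding vector intrinsic.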
// CHECK-LABEL: define i64 @test_vshld_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VSHLD_S64_I]]
int64_t test_vshld_s64(int64_t a, int64_t b) {
  return vshld_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vshld_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VSHLD_U64_I]]
uint64_t test_vshld_u64(uint64_t a, uint64_t b) {
  return vshld_u64(a, b);
}

// CHECK-LABEL: define i8 @test_vqshlb_s8(i8 %a, i8 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK:   [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0
// CHECK:   ret i8 [[TMP2]]
int8_t test_vqshlb_s8(int8_t a, int8_t b) {
  return vqshlb_s8(a, b);
}

// CHECK-LABEL: define i16 @test_vqshlh_s16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
int16_t test_vqshlh_s16(int16_t a, int16_t b) {
  return vqshlh_s16(a, b);
}

// CHECK-LABEL: define i32 @test_vqshls_s32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQSHLS_S32_I]]
int32_t test_vqshls_s32(int32_t a, int32_t b) {
  return vqshls_s32(a, b);
}

// CHECK-LABEL: define i64 @test_vqshld_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VQSHLD_S64_I]]
int64_t test_vqshld_s64(int64_t a, int64_t b) {
  return vqshld_s64(a, b);
}

// CHECK-LABEL: define i8 @test_vqshlb_u8(i8 %a, i8 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK:   [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0
// CHECK:   ret i8 [[TMP2]]
uint8_t test_vqshlb_u8(uint8_t a, uint8_t b) {
  return vqshlb_u8(a, b);
}

// CHECK-LABEL: define i16 @test_vqshlh_u16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
uint16_t test_vqshlh_u16(uint16_t a, uint16_t b) {
  return vqshlh_u16(a, b);
}

// CHECK-LABEL: define i32 @test_vqshls_u32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQSHLS_U32_I]]
uint32_t test_vqshls_u32(uint32_t a, uint32_t b) {
  return vqshls_u32(a, b);
}

// CHECK-LABEL: define i64 @test_vqshld_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VQSHLD_U64_I]]
uint64_t test_vqshld_u64(uint64_t a, uint64_t b) {
  return vqshld_u64(a, b);
}

// CHECK-LABEL: define i64 @test_vrshld_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VRSHLD_S64_I]]
int64_t test_vrshld_s64(int64_t a, int64_t b) {
  return vrshld_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vrshld_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VRSHLD_U64_I]]
uint64_t test_vrshld_u64(uint64_t a, uint64_t b) {
  return vrshld_u64(a, b);
}

// CHECK-LABEL: define i8 @test_vqrshlb_s8(i8 %a, i8 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK:   [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0
// CHECK:   ret i8 [[TMP2]]
int8_t test_vqrshlb_s8(int8_t a, int8_t b) {
  return vqrshlb_s8(a, b);
}

// CHECK-LABEL: define i16 @test_vqrshlh_s16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
int16_t test_vqrshlh_s16(int16_t a, int16_t b) {
  return vqrshlh_s16(a, b);
}

// CHECK-LABEL: define i32 @test_vqrshls_s32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQRSHLS_S32_I]]
int32_t test_vqrshls_s32(int32_t a, int32_t b) {
  return vqrshls_s32(a, b);
}

// CHECK-LABEL: define i64 @test_vqrshld_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VQRSHLD_S64_I]]
int64_t test_vqrshld_s64(int64_t a, int64_t b) {
  return vqrshld_s64(a, b);
}

// CHECK-LABEL: define i8 @test_vqrshlb_u8(i8 %a, i8 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
// CHECK:   [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0
// CHECK:   ret i8 [[TMP2]]
uint8_t test_vqrshlb_u8(uint8_t a, uint8_t b) {
  return vqrshlb_u8(a, b);
}

// CHECK-LABEL: define i16 @test_vqrshlh_u16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
uint16_t test_vqrshlh_u16(uint16_t a, uint16_t b) {
  return vqrshlh_u16(a, b);
}

// CHECK-LABEL: define i32 @test_vqrshls_u32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQRSHLS_U32_I]]
uint32_t test_vqrshls_u32(uint32_t a, uint32_t b) {
  return vqrshls_u32(a, b);
}

// CHECK-LABEL: define i64 @test_vqrshld_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %a, i64 %b) #4
// CHECK:   ret i64 [[VQRSHLD_U64_I]]
uint64_t test_vqrshld_u64(uint64_t a, uint64_t b) {
  return vqrshld_u64(a, b);
}

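// Pairwise reductions of two-element vectors: the integer form uses the uaddv
// reduction intrinsic, the floating-point adds extract both lanes and fadd them,
// and the floating-point min/max forms call the fmaxv/fmaxnmv/fminv/fminnmv
// reduction intrinsics.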
// CHECK-LABEL: define i64 @test_vpaddd_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
// CHECK:   ret i64 [[VPADDD_S64_I]]
int64_t test_vpaddd_s64(int64x2_t a) {
  return vpaddd_s64(a);
}

// CHECK-LABEL: define float @test_vpadds_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[LANE0_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 0
// CHECK:   [[LANE1_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 1
// CHECK:   [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
// CHECK:   ret float [[VPADDD_I]]
float32_t test_vpadds_f32(float32x2_t a) {
  return vpadds_f32(a);
}

// CHECK-LABEL: define double @test_vpaddd_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[LANE0_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
// CHECK:   [[LANE1_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
// CHECK:   [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
// CHECK:   ret double [[VPADDD_I]]
float64_t test_vpaddd_f64(float64x2_t a) {
  return vpaddd_f64(a);
}

// CHECK-LABEL: define float @test_vpmaxnms_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
// CHECK:   ret float [[VPMAXNMS_F32_I]]
float32_t test_vpmaxnms_f32(float32x2_t a) {
  return vpmaxnms_f32(a);
}

// CHECK-LABEL: define double @test_vpmaxnmqd_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
// CHECK:   ret double [[VPMAXNMQD_F64_I]]
float64_t test_vpmaxnmqd_f64(float64x2_t a) {
  return vpmaxnmqd_f64(a);
}

// CHECK-LABEL: define float @test_vpmaxs_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4
// CHECK:   ret float [[VPMAXS_F32_I]]
float32_t test_vpmaxs_f32(float32x2_t a) {
  return vpmaxs_f32(a);
}

// CHECK-LABEL: define double @test_vpmaxqd_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4
// CHECK:   ret double [[VPMAXQD_F64_I]]
float64_t test_vpmaxqd_f64(float64x2_t a) {
  return vpmaxqd_f64(a);
}

// CHECK-LABEL: define float @test_vpminnms_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
// CHECK:   ret float [[VPMINNMS_F32_I]]
float32_t test_vpminnms_f32(float32x2_t a) {
  return vpminnms_f32(a);
}

// CHECK-LABEL: define double @test_vpminnmqd_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
// CHECK:   ret double [[VPMINNMQD_F64_I]]
float64_t test_vpminnmqd_f64(float64x2_t a) {
  return vpminnmqd_f64(a);
}

// CHECK-LABEL: define float @test_vpmins_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4
// CHECK:   ret float [[VPMINS_F32_I]]
float32_t test_vpmins_f32(float32x2_t a) {
  return vpmins_f32(a);
}

// CHECK-LABEL: define double @test_vpminqd_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4
// CHECK:   ret double [[VPMINQD_F64_I]]
float64_t test_vpminqd_f64(float64x2_t a) {
  return vpminqd_f64(a);
}

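// Scalar saturating (rounding) doubling multiply high: the i16 forms go through
// lane 0 of the v4i16 intrinsics; the i32 forms have dedicated scalar intrinsics.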
// CHECK-LABEL: define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
int16_t test_vqdmulhh_s16(int16_t a, int16_t b) {
  return vqdmulhh_s16(a, b);
}

// CHECK-LABEL: define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_s32(int32_t a, int32_t b) {
  return vqdmulhs_s32(a, b);
}

// CHECK-LABEL: define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP2]]
int16_t test_vqrdmulhh_s16(int16_t a, int16_t b) {
  return vqrdmulhh_s16(a, b);
}

// CHECK-LABEL: define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) #0 {
// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 %b) #4
// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_s32(int32_t a, int32_t b) {
  return vqrdmulhs_s32(a, b);
}

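// Scalar fmulx, frecps and frsqrts map directly onto their float/double intrinsics.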
// CHECK-LABEL: define float @test_vmulxs_f32(float %a, float %b) #0 {
// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) #4
// CHECK:   ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_f32(float32_t a, float32_t b) {
  return vmulxs_f32(a, b);
}

// CHECK-LABEL: define double @test_vmulxd_f64(double %a, double %b) #0 {
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) #4
// CHECK:   ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_f64(float64_t a, float64_t b) {
  return vmulxd_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vmulx_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> [[VMULX_I]], <1 x double> [[VMULX1_I]]) #4
// CHECK:   ret <1 x double> [[VMULX2_I]]
float64x1_t test_vmulx_f64(float64x1_t a, float64x1_t b) {
  return vmulx_f64(a, b);
}

// CHECK-LABEL: define float @test_vrecpss_f32(float %a, float %b) #0 {
// CHECK:   [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float %a, float %b) #4
// CHECK:   ret float [[VRECPS_I]]
float32_t test_vrecpss_f32(float32_t a, float32_t b) {
  return vrecpss_f32(a, b);
}

// CHECK-LABEL: define double @test_vrecpsd_f64(double %a, double %b) #0 {
// CHECK:   [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double %a, double %b) #4
// CHECK:   ret double [[VRECPS_I]]
float64_t test_vrecpsd_f64(float64_t a, float64_t b) {
  return vrecpsd_f64(a, b);
}

// CHECK-LABEL: define float @test_vrsqrtss_f32(float %a, float %b) #0 {
// CHECK:   [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) #4
// CHECK:   ret float [[VRSQRTSS_F32_I]]
float32_t test_vrsqrtss_f32(float32_t a, float32_t b) {
  return vrsqrtss_f32(a, b);
}

// CHECK-LABEL: define double @test_vrsqrtsd_f64(double %a, double %b) #0 {
// CHECK:   [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) #4
// CHECK:   ret double [[VRSQRTSD_F64_I]]
float64_t test_vrsqrtsd_f64(float64_t a, float64_t b) {
  return vrsqrtsd_f64(a, b);
}

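// Scalar integer-to-float conversions lower to plain sitofp/uitofp; the estimate
// operations below (frecpe, frecpx, ursqrte, frsqrte) remain intrinsic calls.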
// CHECK-LABEL: define float @test_vcvts_f32_s32(i32 %a) #0 {
// CHECK:   [[TMP0:%.*]] = sitofp i32 %a to float
// CHECK:   ret float [[TMP0]]
float32_t test_vcvts_f32_s32(int32_t a) {
  return vcvts_f32_s32(a);
}

// CHECK-LABEL: define double @test_vcvtd_f64_s64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = sitofp i64 %a to double
// CHECK:   ret double [[TMP0]]
float64_t test_vcvtd_f64_s64(int64_t a) {
  return vcvtd_f64_s64(a);
}

// CHECK-LABEL: define float @test_vcvts_f32_u32(i32 %a) #0 {
// CHECK:   [[TMP0:%.*]] = uitofp i32 %a to float
// CHECK:   ret float [[TMP0]]
float32_t test_vcvts_f32_u32(uint32_t a) {
  return vcvts_f32_u32(a);
}

// CHECK-LABEL: define double @test_vcvtd_f64_u64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = uitofp i64 %a to double
// CHECK:   ret double [[TMP0]]
float64_t test_vcvtd_f64_u64(uint64_t a) {
  return vcvtd_f64_u64(a);
}

// CHECK-LABEL: define float @test_vrecpes_f32(float %a) #0 {
// CHECK:   [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float %a) #4
// CHECK:   ret float [[VRECPES_F32_I]]
float32_t test_vrecpes_f32(float32_t a) {
  return vrecpes_f32(a);
}

// CHECK-LABEL: define double @test_vrecped_f64(double %a) #0 {
// CHECK:   [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double %a) #4
// CHECK:   ret double [[VRECPED_F64_I]]
float64_t test_vrecped_f64(float64_t a) {
  return vrecped_f64(a);
}

// CHECK-LABEL: define float @test_vrecpxs_f32(float %a) #0 {
// CHECK:   [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float %a) #4
// CHECK:   ret float [[VRECPXS_F32_I]]
float32_t test_vrecpxs_f32(float32_t a) {
  return vrecpxs_f32(a);
}

// CHECK-LABEL: define double @test_vrecpxd_f64(double %a) #0 {
// CHECK:   [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double %a) #4
// CHECK:   ret double [[VRECPXD_F64_I]]
float64_t test_vrecpxd_f64(float64_t a) {
  return vrecpxd_f64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4
// CHECK:   ret <2 x i32> [[VRSQRTE_V1_I]]
uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
  return vrsqrte_u32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4
// CHECK:   ret <4 x i32> [[VRSQRTEQ_V1_I]]
uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
  return vrsqrteq_u32(a);
}

// CHECK-LABEL: define float @test_vrsqrtes_f32(float %a) #0 {
// CHECK:   [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float %a) #4
// CHECK:   ret float [[VRSQRTES_F32_I]]
float32_t test_vrsqrtes_f32(float32_t a) {
  return vrsqrtes_f32(a);
}

// CHECK-LABEL: define double @test_vrsqrted_f64(double %a) #0 {
// CHECK:   [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double %a) #4
// CHECK:   ret double [[VRSQRTED_F64_I]]
float64_t test_vrsqrted_f64(float64_t a) {
  return vrsqrted_f64(a);
}

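// vld1 lowers to an ordinary vector load through a bitcast of the element pointer;
// no intrinsic call is needed.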
// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
// CHECK:   ret <16 x i8> [[TMP1]]
uint8x16_t test_vld1q_u8(uint8_t const *a) {
  return vld1q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vld1q_u16(uint16_t const *a) {
  return vld1q_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vld1q_u32(uint32_t const *a) {
  return vld1q_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vld1q_u64(uint64_t const *a) {
  return vld1q_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
// CHECK:   ret <16 x i8> [[TMP1]]
int8x16_t test_vld1q_s8(int8_t const *a) {
  return vld1q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vld1q_s16(int16_t const *a) {
  return vld1q_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vld1q_s32(int32_t const *a) {
  return vld1q_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vld1q_s64(int64_t const *a) {
  return vld1q_s64(a);
}

// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half>
// CHECK:   ret <8 x half> [[TMP3]]
float16x8_t test_vld1q_f16(float16_t const *a) {
  return vld1q_f16(a);
}

// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK:   [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]]
// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vld1q_f32(float32_t const *a) {
  return vld1q_f32(a);
}

// CHECK-LABEL: define <2 x double> @test_vld1q_f64(double* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
// CHECK:   [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]]
// CHECK:   ret <2 x double> [[TMP2]]
float64x2_t test_vld1q_f64(float64_t const *a) {
  return vld1q_f64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
// CHECK:   ret <16 x i8> [[TMP1]]
poly8x16_t test_vld1q_p8(poly8_t const *a) {
  return vld1q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
// CHECK:   ret <8 x i16> [[TMP2]]
poly16x8_t test_vld1q_p16(poly16_t const *a) {
  return vld1q_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
// CHECK:   ret <8 x i8> [[TMP1]]
uint8x8_t test_vld1_u8(uint8_t const *a) {
  return vld1_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vld1_u16(uint16_t const *a) {
  return vld1_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vld1_u32(uint32_t const *a) {
  return vld1_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vld1_u64(uint64_t const *a) {
  return vld1_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
// CHECK:   ret <8 x i8> [[TMP1]]
int8x8_t test_vld1_s8(int8_t const *a) {
  return vld1_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vld1_s16(int16_t const *a) {
  return vld1_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vld1_s32(int32_t const *a) {
  return vld1_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vld1_s64(int64_t const *a) {
  return vld1_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP3]]
float16x4_t test_vld1_f16(float16_t const *a) {
  return vld1_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
// CHECK:   [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]]
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vld1_f32(float32_t const *a) {
  return vld1_f32(a);
}

// CHECK-LABEL: define <1 x double> @test_vld1_f64(double* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
// CHECK:   [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]]
// CHECK:   ret <1 x double> [[TMP2]]
float64x1_t test_vld1_f64(float64_t const *a) {
  return vld1_f64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
// CHECK:   ret <8 x i8> [[TMP1]]
poly8x8_t test_vld1_p8(poly8_t const *a) {
  return vld1_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
// CHECK:   ret <4 x i16> [[TMP2]]
poly16x4_t test_vld1_p16(poly16_t const *a) {
  return vld1_p16(a);
}

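// vld2 calls the ld2 intrinsic, which returns a pair of vectors; the pair is
// stored to a temporary and then memcpy'd into the two-vector NEON struct that is
// returned by value.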
// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_u8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint8x16x2_t [[TMP5]]
uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
  return vld2q_u8(a);
}

// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_u16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
  return vld2q_u16(a);
}

// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_u32(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
  return vld2q_u32(a);
}

// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_u64(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
  return vld2q_u64(a);
}

// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_s8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int8x16x2_t [[TMP5]]
int8x16x2_t test_vld2q_s8(int8_t const *a) {
  return vld2q_s8(a);
}

// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_s16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
int16x8x2_t test_vld2q_s16(int16_t const *a) {
  return vld2q_s16(a);
}

// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_s32(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
int32x4x2_t test_vld2q_s32(int32_t const *a) {
  return vld2q_s32(a);
}

// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_s64(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld2q_s64(int64_t const *a) {
  return vld2q_s64(a);
}

// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_f16(half* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
float16x8x2_t test_vld2q_f16(float16_t const *a) {
  return vld2q_f16(a);
}

// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_f32(float* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
// CHECK:   [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
float32x4x2_t test_vld2q_f32(float32_t const *a) {
  return vld2q_f32(a);
}

// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_f64(double* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
// CHECK:   [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0v2f64(<2 x double>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
// CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld2q_f64(float64_t const *a) {
  return vld2q_f64(a);
}

// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_p8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
   10317 // CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
   10318 // CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
   10319 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
   10320 // CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
   10321 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
   10322 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
   10323 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
   10324 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
   10325 // CHECK:   ret %struct.poly8x16x2_t [[TMP5]]
   10326 poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
   10327   return vld2q_p8(a);
   10328 }
   10329 
   10330 // CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_p16(i16* %a) #0 {
   10331 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
   10332 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
   10333 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
   10334 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   10335 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
   10336 // CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
   10337 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
   10338 // CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
   10339 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
   10340 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
   10341 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
   10342 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
   10343 // CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
   10344 poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
   10345   return vld2q_p16(a);
   10346 }
   10347 
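// Illustrative sketch only, not part of the FileCheck-verified tests (the
// helper name is hypothetical and it assumes "p" points at 32 readable
// bytes): the q-form intrinsics above fill 128-bit Q registers, while the
// non-q forms below fill 64-bit D registers, so each result vector holds
// half as many lanes.
static inline void q_vs_d_forms_sketch(const uint8_t *p) {
  uint8x16x2_t q = vld2q_u8(p); // two de-interleaved 16-lane vectors (32 bytes)
  uint8x8x2_t d = vld2_u8(p);   // two de-interleaved 8-lane vectors (16 bytes)
  (void)q;
  (void)d;
}
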
// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_u8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint8x8x2_t [[TMP5]]
uint8x8x2_t test_vld2_u8(uint8_t const *a) {
  return vld2_u8(a);
}

// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_u16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld2_u16(uint16_t const *a) {
  return vld2_u16(a);
}

// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_u32(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld2_u32(uint32_t const *a) {
  return vld2_u32(a);
}

// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_u64(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld2_u64(uint64_t const *a) {
  return vld2_u64(a);
}

// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_s8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int8x8x2_t [[TMP5]]
int8x8x2_t test_vld2_s8(int8_t const *a) {
  return vld2_s8(a);
}

// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_s16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
int16x4x2_t test_vld2_s16(int16_t const *a) {
  return vld2_s16(a);
}

// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_s32(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
int32x2x2_t test_vld2_s32(int32_t const *a) {
  return vld2_s32(a);
}

// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_s64(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
int64x1x2_t test_vld2_s64(int64_t const *a) {
  return vld2_s64(a);
}

// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_f16(half* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
float16x4x2_t test_vld2_f16(float16_t const *a) {
  return vld2_f16(a);
}

// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_f32(float* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
// CHECK:   [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0v2f32(<2 x float>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
float32x2x2_t test_vld2_f32(float32_t const *a) {
  return vld2_f32(a);
}

// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_f64(double* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
// CHECK:   [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0v1f64(<1 x double>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
// CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld2_f64(float64_t const *a) {
  return vld2_f64(a);
}

// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_p8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly8x8x2_t [[TMP5]]
poly8x8x2_t test_vld2_p8(poly8_t const *a) {
  return vld2_p8(a);
}

// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_p16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_p16(poly16_t const *a) {
  return vld2_p16(a);
}

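// Usage sketch (illustrative only, not FileCheck-verified; the helper name is
// hypothetical): vld2 performs a de-interleaving load, so interleaved stereo
// samples {L0,R0,L1,R1,...} come back as one vector of left and one vector of
// right channel samples.
static inline void split_stereo_sketch(const int16_t *interleaved,
                                       int16x4_t *left, int16x4_t *right) {
  int16x4x2_t lr = vld2_s16(interleaved); // loads 8 samples, de-interleaving
  *left = lr.val[0];                      // L0..L3
  *right = lr.val[1];                     // R0..R3
}
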
   10579 // CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_u8(i8* %a) #0 {
   10580 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
   10581 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
   10582 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
   10583 // CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
   10584 // CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
   10585 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
   10586 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
   10587 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
   10588 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
   10589 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
   10590 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
   10591 // CHECK:   ret %struct.uint8x16x3_t [[TMP5]]
   10592 uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
   10593   return vld3q_u8(a);
   10594 }
   10595 
   10596 // CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_u16(i16* %a) #0 {
   10597 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
   10598 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
   10599 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
   10600 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   10601 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
   10602 // CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
   10603 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   10604 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
   10605 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
   10606 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
   10607 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10608 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
   10609 // CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
   10610 uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
   10611   return vld3q_u16(a);
   10612 }
   10613 
   10614 // CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_u32(i32* %a) #0 {
   10615 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
   10616 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
   10617 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
   10618 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   10619 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
   10620 // CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
   10621 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
   10622 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
   10623 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
   10624 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
   10625 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10626 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
   10627 // CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
   10628 uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
   10629   return vld3q_u32(a);
   10630 }
   10631 
   10632 // CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_u64(i64* %a) #0 {
   10633 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
   10634 // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
   10635 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
   10636 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   10637 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
   10638 // CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
   10639 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
   10640 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
   10641 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
   10642 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
   10643 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10644 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
   10645 // CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
   10646 uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
   10647   return vld3q_u64(a);
   10648 }
   10649 
   10650 // CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_s8(i8* %a) #0 {
   10651 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
   10652 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
   10653 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
   10654 // CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
   10655 // CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
   10656 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
   10657 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
   10658 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
   10659 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
   10660 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
   10661 // CHECK:   [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
   10662 // CHECK:   ret %struct.int8x16x3_t [[TMP5]]
   10663 int8x16x3_t test_vld3q_s8(int8_t const *a) {
   10664   return vld3q_s8(a);
   10665 }
   10666 
   10667 // CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_s16(i16* %a) #0 {
   10668 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
   10669 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
   10670 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
   10671 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   10672 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
   10673 // CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
   10674 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   10675 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
   10676 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
   10677 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
   10678 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10679 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
   10680 // CHECK:   ret %struct.int16x8x3_t [[TMP6]]
   10681 int16x8x3_t test_vld3q_s16(int16_t const *a) {
   10682   return vld3q_s16(a);
   10683 }
   10684 
   10685 // CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_s32(i32* %a) #0 {
   10686 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
   10687 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
   10688 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
   10689 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   10690 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
   10691 // CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
   10692 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
   10693 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
   10694 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
   10695 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
   10696 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10697 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
   10698 // CHECK:   ret %struct.int32x4x3_t [[TMP6]]
   10699 int32x4x3_t test_vld3q_s32(int32_t const *a) {
   10700   return vld3q_s32(a);
   10701 }
   10702 
   10703 // CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_s64(i64* %a) #0 {
   10704 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
   10705 // CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
   10706 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
   10707 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   10708 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
   10709 // CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
   10710 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
   10711 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
   10712 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
   10713 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
   10714 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10715 // CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
   10716 // CHECK:   ret %struct.int64x2x3_t [[TMP6]]
   10717 int64x2x3_t test_vld3q_s64(int64_t const *a) {
   10718   return vld3q_s64(a);
   10719 }
   10720 
   10721 // CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_f16(half* %a) #0 {
   10722 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
   10723 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
   10724 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
   10725 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
   10726 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
   10727 // CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
   10728 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   10729 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
   10730 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
   10731 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
   10732 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10733 // CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
   10734 // CHECK:   ret %struct.float16x8x3_t [[TMP6]]
   10735 float16x8x3_t test_vld3q_f16(float16_t const *a) {
   10736   return vld3q_f16(a);
   10737 }
   10738 
   10739 // CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_f32(float* %a) #0 {
   10740 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
   10741 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
   10742 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
   10743 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
   10744 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
   10745 // CHECK:   [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP2]])
   10746 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
   10747 // CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
   10748 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
   10749 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
   10750 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10751 // CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
   10752 // CHECK:   ret %struct.float32x4x3_t [[TMP6]]
   10753 float32x4x3_t test_vld3q_f32(float32_t const *a) {
   10754   return vld3q_f32(a);
   10755 }
   10756 
   10757 // CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_f64(double* %a) #0 {
   10758 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
   10759 // CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
   10760 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
   10761 // CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
   10762 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
   10763 // CHECK:   [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0v2f64(<2 x double>* [[TMP2]])
   10764 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
   10765 // CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
   10766 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
   10767 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
   10768 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10769 // CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
   10770 // CHECK:   ret %struct.float64x2x3_t [[TMP6]]
   10771 float64x2x3_t test_vld3q_f64(float64_t const *a) {
   10772   return vld3q_f64(a);
   10773 }
   10774 
   10775 // CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_p8(i8* %a) #0 {
   10776 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
   10777 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
   10778 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
   10779 // CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
   10780 // CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
   10781 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
   10782 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
   10783 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
   10784 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
   10785 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
   10786 // CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
   10787 // CHECK:   ret %struct.poly8x16x3_t [[TMP5]]
   10788 poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
   10789   return vld3q_p8(a);
   10790 }
   10791 
   10792 // CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_p16(i16* %a) #0 {
   10793 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
   10794 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
   10795 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
   10796 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   10797 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
   10798 // CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
   10799 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   10800 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
   10801 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
   10802 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
   10803 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   10804 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
   10805 // CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
   10806 poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
   10807   return vld3q_p16(a);
   10808 }
   10809 
   10810 // CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_u8(i8* %a) #0 {
   10811 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
   10812 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
   10813 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
   10814 // CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
   10815 // CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
   10816 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
   10817 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
   10818 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
   10819 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
   10820 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
   10821 // CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
   10822 // CHECK:   ret %struct.uint8x8x3_t [[TMP5]]
   10823 uint8x8x3_t test_vld3_u8(uint8_t const *a) {
   10824   return vld3_u8(a);
   10825 }
   10826 
   10827 // CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_u16(i16* %a) #0 {
   10828 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
   10829 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
   10830 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
   10831 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   10832 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
   10833 // CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
   10834 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   10835 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
   10836 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
   10837 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
   10838 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
   10839 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
   10840 // CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
   10841 uint16x4x3_t test_vld3_u16(uint16_t const *a) {
   10842   return vld3_u16(a);
   10843 }
   10844 
   10845 // CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_u32(i32* %a) #0 {
   10846 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
   10847 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
   10848 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
   10849 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   10850 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
   10851 // CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
   10852 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
   10853 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
   10854 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
   10855 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
   10856 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
   10857 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
   10858 // CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
   10859 uint32x2x3_t test_vld3_u32(uint32_t const *a) {
   10860   return vld3_u32(a);
   10861 }
   10862 
   10863 // CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_u64(i64* %a) #0 {
   10864 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
   10865 // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
   10866 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
   10867 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   10868 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
   10869 // CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
   10870 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
   10871 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
   10872 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
   10873 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
   10874 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
   10875 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
   10876 // CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
   10877 uint64x1x3_t test_vld3_u64(uint64_t const *a) {
   10878   return vld3_u64(a);
   10879 }
   10880 
   10881 // CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_s8(i8* %a) #0 {
   10882 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
   10883 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
   10884 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
   10885 // CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
   10886 // CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
   10887 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
   10888 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
   10889 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
   10890 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
   10891 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
   10892 // CHECK:   [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
   10893 // CHECK:   ret %struct.int8x8x3_t [[TMP5]]
   10894 int8x8x3_t test_vld3_s8(int8_t const *a) {
   10895   return vld3_s8(a);
   10896 }
   10897 
   10898 // CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_s16(i16* %a) #0 {
   10899 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
   10900 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
   10901 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
   10902 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   10903 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
   10904 // CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
   10905 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   10906 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
   10907 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
   10908 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
   10909 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
   10910 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
   10911 // CHECK:   ret %struct.int16x4x3_t [[TMP6]]
   10912 int16x4x3_t test_vld3_s16(int16_t const *a) {
   10913   return vld3_s16(a);
   10914 }
   10915 
   10916 // CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_s32(i32* %a) #0 {
   10917 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
   10918 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
   10919 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
   10920 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   10921 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
   10922 // CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
   10923 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
   10924 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
   10925 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
   10926 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
   10927 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
   10928 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
   10929 // CHECK:   ret %struct.int32x2x3_t [[TMP6]]
   10930 int32x2x3_t test_vld3_s32(int32_t const *a) {
   10931   return vld3_s32(a);
   10932 }
   10933 
   10934 // CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_s64(i64* %a) #0 {
   10935 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
   10936 // CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
   10937 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
   10938 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   10939 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
   10940 // CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
   10941 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
   10942 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
   10943 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
   10944 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
   10945 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
   10946 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
   10947 // CHECK:   ret %struct.int64x1x3_t [[TMP6]]
   10948 int64x1x3_t test_vld3_s64(int64_t const *a) {
   10949   return vld3_s64(a);
   10950 }
   10951 
   10952 // CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_f16(half* %a) #0 {
   10953 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
   10954 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
   10955 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
   10956 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
   10957 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
   10958 // CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
   10959 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
   10960 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
   10961 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
   10962 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
   10963 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
   10964 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
   10965 // CHECK:   ret %struct.float16x4x3_t [[TMP6]]
   10966 float16x4x3_t test_vld3_f16(float16_t const *a) {
   10967   return vld3_f16(a);
   10968 }
   10969 
   10970 // CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_f32(float* %a) #0 {
   10971 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
   10972 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
   10973 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
   10974 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
   10975 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
   10976 // CHECK:   [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0v2f32(<2 x float>* [[TMP2]])
   10977 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
   10978 // CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
   10979 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
   10980 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
   10981 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
   10982 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
   10983 // CHECK:   ret %struct.float32x2x3_t [[TMP6]]
   10984 float32x2x3_t test_vld3_f32(float32_t const *a) {
   10985   return vld3_f32(a);
   10986 }
   10987 
   10988 // CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_f64(double* %a) #0 {
   10989 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
   10990 // CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
// CHECK:   [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0v1f64(<1 x double>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld3_f64(float64_t const *a) {
  return vld3_f64(a);
}

// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_p8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly8x8x3_t [[TMP5]]
poly8x8x3_t test_vld3_p8(poly8_t const *a) {
  return vld3_p8(a);
}

// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_p16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
poly16x4x3_t test_vld3_p16(poly16_t const *a) {
  return vld3_p16(a);
}

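// The vld4q tests below follow the same lowering pattern as the vld2/vld3
// tests above: the ld4 result is written into the local __ret aggregate via
// @llvm.aarch64.neon.ld4.*, then copied into the return slot with a 64-byte,
// 16-byte-aligned memcpy before being loaded and returned by value.
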
// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_u8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint8x16x4_t [[TMP5]]
uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
  return vld4q_u8(a);
}

// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_u16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
  return vld4q_u16(a);
}

// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_u32(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
  return vld4q_u32(a);
}

// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_u64(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
  return vld4q_u64(a);
}

// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_s8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int8x16x4_t [[TMP5]]
int8x16x4_t test_vld4q_s8(int8_t const *a) {
  return vld4q_s8(a);
}

// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_s16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
int16x8x4_t test_vld4q_s16(int16_t const *a) {
  return vld4q_s16(a);
}

// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_s32(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
int32x4x4_t test_vld4q_s32(int32_t const *a) {
  return vld4q_s32(a);
}

// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_s64(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
int64x2x4_t test_vld4q_s64(int64_t const *a) {
  return vld4q_s64(a);
}

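// Note: at this stage of codegen, float16 vectors are modeled as <8 x i16>,
// so the f16 test below expects the v8i16 form of the ld4 intrinsic rather
// than a half-vector form.
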
// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_f16(half* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
float16x8x4_t test_vld4q_f16(float16_t const *a) {
  return vld4q_f16(a);
}

// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_f32(float* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
// CHECK:   [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
float32x4x4_t test_vld4q_f32(float32_t const *a) {
  return vld4q_f32(a);
}

// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_f64(double* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
// CHECK:   [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0v2f64(<2 x double>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld4q_f64(float64_t const *a) {
  return vld4q_f64(a);
}

// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_p8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly8x16x4_t [[TMP5]]
poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
  return vld4q_p8(a);
}

// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_p16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
  return vld4q_p16(a);
}

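// The non-q (64-bit, D-register) vld4 variants below mirror the pattern
// above at half the width: the x4 aggregate is 32 bytes with 8-byte
// alignment, so the expected memcpy is `i64 32, i32 8`.
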
// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_u8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint8x8x4_t [[TMP5]]
uint8x8x4_t test_vld4_u8(uint8_t const *a) {
  return vld4_u8(a);
}

// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_u16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
uint16x4x4_t test_vld4_u16(uint16_t const *a) {
  return vld4_u16(a);
}

// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_u32(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
uint32x2x4_t test_vld4_u32(uint32_t const *a) {
  return vld4_u32(a);
}

// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_u64(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
uint64x1x4_t test_vld4_u64(uint64_t const *a) {
  return vld4_u64(a);
}

// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_s8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int8x8x4_t [[TMP5]]
int8x8x4_t test_vld4_s8(int8_t const *a) {
  return vld4_s8(a);
}

// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_s16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
int16x4x4_t test_vld4_s16(int16_t const *a) {
  return vld4_s16(a);
}

// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_s32(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
int32x2x4_t test_vld4_s32(int32_t const *a) {
  return vld4_s32(a);
}

// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_s64(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
int64x1x4_t test_vld4_s64(int64_t const *a) {
  return vld4_s64(a);
}

// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_f16(half* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
float16x4x4_t test_vld4_f16(float16_t const *a) {
  return vld4_f16(a);
}

// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_f32(float* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
// CHECK:   [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0v2f32(<2 x float>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
float32x2x4_t test_vld4_f32(float32_t const *a) {
  return vld4_f32(a);
}

// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_f64(double* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
// CHECK:   [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0v1f64(<1 x double>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float64x1x4_t [[TMP6]]
float64x1x4_t test_vld4_f64(float64_t const *a) {
  return vld4_f64(a);
}

// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_p8(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly8x8x4_t [[TMP5]]
poly8x8x4_t test_vld4_p8(poly8_t const *a) {
  return vld4_p8(a);
}

// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_p16(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
poly16x4x4_t test_vld4_p16(poly16_t const *a) {
  return vld4_p16(a);
}

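// Illustrative sketch (not CHECK-verified; the function name and buffer
// layout are hypothetical): the vld4 intrinsics tested above pair naturally
// with the single-vector vst1 stores tested below, e.g. to de-interleave
// packed RGBA pixels and write one channel back out contiguously. Assumes
// 32 readable bytes at `rgba` and 8 writable bytes at `dst`.
static inline void example_extract_red(uint8_t *dst, const uint8_t *rgba) {
  uint8x8x4_t px = vld4_u8(rgba); // px.val[0]=R, val[1]=G, val[2]=B, val[3]=A
  vst1_u8(dst, px.val[0]);        // contiguous store of the eight red bytes
}
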
// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   store <16 x i8> %b, <16 x i8>* [[TMP0]]
// CHECK:   ret void
void test_vst1q_u8(uint8_t *a, uint8x16_t b) {
  vst1q_u8(a, b);
}

// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_u16(uint16_t *a, uint16x8_t b) {
  vst1q_u16(a, b);
}

// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_u32(uint32_t *a, uint32x4_t b) {
  vst1q_u32(a, b);
}

// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_u64(uint64_t *a, uint64x2_t b) {
  vst1q_u64(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   store <16 x i8> %b, <16 x i8>* [[TMP0]]
// CHECK:   ret void
void test_vst1q_s8(int8_t *a, int8x16_t b) {
  vst1q_s8(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_s16(int16_t *a, int16x8_t b) {
  vst1q_s16(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_s32(int32_t *a, int32x4_t b) {
  vst1q_s32(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_s64(int64_t *a, int64x2_t b) {
  vst1q_s64(a, b);
}

// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_f16(float16_t *a, float16x8_t b) {
  vst1q_f16(a, b);
}

// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   store <4 x float> [[TMP3]], <4 x float>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_f32(float32_t *a, float32x4_t b) {
  vst1q_f32(a, b);
}

// CHECK-LABEL: define void @test_vst1q_f64(double* %a, <2 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK:   store <2 x double> [[TMP3]], <2 x double>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_f64(float64_t *a, float64x2_t b) {
  vst1q_f64(a, b);
}

// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
// CHECK:   store <16 x i8> %b, <16 x i8>* [[TMP0]]
// CHECK:   ret void
void test_vst1q_p8(poly8_t *a, poly8x16_t b) {
  vst1q_p8(a, b);
}

// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
// CHECK:   ret void
void test_vst1q_p16(poly16_t *a, poly16x8_t b) {
  vst1q_p16(a, b);
}

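// Like the q forms above, the 64-bit vst1 variants below lower to a plain IR
// `store`; non-byte element types are merely round-tripped through <8 x i8>
// bitcasts by the generic codegen path, and no target intrinsic is needed
// for a single-vector store.
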
// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   store <8 x i8> %b, <8 x i8>* [[TMP0]]
// CHECK:   ret void
void test_vst1_u8(uint8_t *a, uint8x8_t b) {
  vst1_u8(a, b);
}

// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
// CHECK:   ret void
void test_vst1_u16(uint16_t *a, uint16x4_t b) {
  vst1_u16(a, b);
}

// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]]
// CHECK:   ret void
void test_vst1_u32(uint32_t *a, uint32x2_t b) {
  vst1_u32(a, b);
}

// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
// CHECK:   ret void
void test_vst1_u64(uint64_t *a, uint64x1_t b) {
  vst1_u64(a, b);
}

// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   store <8 x i8> %b, <8 x i8>* [[TMP0]]
// CHECK:   ret void
void test_vst1_s8(int8_t *a, int8x8_t b) {
  vst1_s8(a, b);
}

// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
// CHECK:   ret void
void test_vst1_s16(int16_t *a, int16x4_t b) {
  vst1_s16(a, b);
}

// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]]
// CHECK:   ret void
void test_vst1_s32(int32_t *a, int32x2_t b) {
  vst1_s32(a, b);
}

// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
// CHECK:   ret void
void test_vst1_s64(int64_t *a, int64x1_t b) {
  vst1_s64(a, b);
}

// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
// CHECK:   ret void
void test_vst1_f16(float16_t *a, float16x4_t b) {
  vst1_f16(a, b);
}

// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   store <2 x float> [[TMP3]], <2 x float>* [[TMP2]]
// CHECK:   ret void
void test_vst1_f32(float32_t *a, float32x2_t b) {
  vst1_f32(a, b);
}

// CHECK-LABEL: define void @test_vst1_f64(double* %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   store <1 x double> [[TMP3]], <1 x double>* [[TMP2]]
// CHECK:   ret void
void test_vst1_f64(float64_t *a, float64x1_t b) {
  vst1_f64(a, b);
}

// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
// CHECK:   store <8 x i8> %b, <8 x i8>* [[TMP0]]
// CHECK:   ret void
void test_vst1_p8(poly8_t *a, poly8x8_t b) {
  vst1_p8(a, b);
}

// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
// CHECK:   ret void
void test_vst1_p16(poly16_t *a, poly16x4_t b) {
  vst1_p16(a, b);
}

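// The vst2q tests below check the store side of the interleaved ops: the x2
// aggregate arrives as a [2 x <16 x i8>] coercion, is spilled to a local and
// memcpy'd into __s1, and its two lanes are reloaded and passed to
// @llvm.aarch64.neon.st2.*, which writes them out interleaved.
// Illustrative sketch (not CHECK-verified; names are hypothetical) of using
// vst2q_u8 to interleave two separate 16-byte planes into one 32-byte stream:
static inline void example_interleave_planes(uint8_t *dst, const uint8_t *p0,
                                             const uint8_t *p1) {
  uint8x16x2_t v;
  v.val[0] = vld1q_u8(p0);
  v.val[1] = vld1q_u8(p1);
  vst2q_u8(dst, v); // dst = p0[0], p1[0], p0[1], p1[1], ..., p0[15], p1[15]
}
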
   11771 // CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
   11772 // CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
   11773 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
   11774 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
   11775 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
   11776 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
   11777 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
   11778 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   11779 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
   11780 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
   11781 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   11782 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
   11783 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
   11784 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   11785 // CHECK:   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
   11786 // CHECK:   ret void
   11787 void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) {
   11788   vst2q_u8(a, b);
   11789 }
   11790 
   11791 // CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
   11792 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
   11793 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
   11794 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
   11795 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
   11796 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
   11797 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
   11798 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   11799 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   11800 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
   11801 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
   11802 // CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   11803 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
   11804 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
   11805 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
   11806 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   11807 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
   11808 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
   11809 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
   11810 // CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
   11811 // CHECK:   ret void
   11812 void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) {
   11813   vst2q_u16(a, b);
   11814 }
   11815 
   11816 // CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
   11817 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
   11818 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
   11819 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
   11820 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
   11821 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
   11822 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
   11823 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   11824 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   11825 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
   11826 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
   11827 // CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
   11828 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
   11829 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
   11830 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
   11831 // CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
   11832 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
   11833 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
   11834 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
   11835 // CHECK:   call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]])
   11836 // CHECK:   ret void
   11837 void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) {
   11838   vst2q_u32(a, b);
   11839 }
   11840 
// CHECK-LABEL: define void @test_vst2q_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) {
  vst2q_u64(a, b);
}

// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
// CHECK:   ret void
void test_vst2q_s8(int8_t *a, int8x16x2_t b) {
  vst2q_s8(a, b);
}

// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2q_s16(int16_t *a, int16x8x2_t b) {
  vst2q_s16(a, b);
}

// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2q_s32(int32_t *a, int32x4x2_t b) {
  vst2q_s32(a, b);
}

// CHECK-LABEL: define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
  vst2q_s64(a, b);
}

// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2q_f16(float16_t *a, float16x8x2_t b) {
  vst2q_f16(a, b);
}

// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK:   call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2q_f32(float32_t *a, float32x4x2_t b) {
  vst2q_f32(a, b);
}

// CHECK-LABEL: define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
// CHECK:   call void @llvm.aarch64.neon.st2.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2q_f64(float64_t *a, float64x2x2_t b) {
  vst2q_f64(a, b);
}

// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
// CHECK:   ret void
void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) {
  vst2q_p8(a, b);
}

// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) {
  vst2q_p16(a, b);
}

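// A minimal usage sketch, not exercised by any CHECK line (the function name
// is illustrative only): vst2q interleaves its two registers on store, so
// this writes even[0], odd[0], even[1], odd[1], ... into dst, which is
// assumed to have room for 8 uint32_t values.
void example_vst2q_u32_interleave(uint32_t *dst, uint32x4_t even, uint32x4_t odd) {
  uint32x4x2_t v;
  v.val[0] = even;
  v.val[1] = odd;
  vst2q_u32(dst, v); // lowers to a single st2 storing 32 bytes interleaved
}
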
// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
// CHECK:   ret void
void test_vst2_u8(uint8_t *a, uint8x8x2_t b) {
  vst2_u8(a, b);
}

// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_u16(uint16_t *a, uint16x4x2_t b) {
  vst2_u16(a, b);
}

// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_u32(uint32_t *a, uint32x2x2_t b) {
  vst2_u32(a, b);
}

// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_u64(uint64_t *a, uint64x1x2_t b) {
  vst2_u64(a, b);
}

// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
// CHECK:   ret void
void test_vst2_s8(int8_t *a, int8x8x2_t b) {
  vst2_s8(a, b);
}

// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_s16(int16_t *a, int16x4x2_t b) {
  vst2_s16(a, b);
}

// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_s32(int32_t *a, int32x2x2_t b) {
  vst2_s32(a, b);
}

// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_s64(int64_t *a, int64x1x2_t b) {
  vst2_s64(a, b);
}

// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_f16(float16_t *a, float16x4x2_t b) {
  vst2_f16(a, b);
}

// CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK:   call void @llvm.aarch64.neon.st2.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_f32(float32_t *a, float32x2x2_t b) {
  vst2_f32(a, b);
}

// CHECK-LABEL: define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK:   call void @llvm.aarch64.neon.st2.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_f64(float64_t *a, float64x1x2_t b) {
  vst2_f64(a, b);
}

// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
// CHECK:   ret void
void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
  vst2_p8(a, b);
}

// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
  vst2_p16(a, b);
}

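// Likewise a sketch rather than a checked test (the function name is
// illustrative only): the 64-bit (non-q) forms pair naturally with vld2 in
// deinterleave/reinterleave loops. src and dst are assumed to each hold at
// least 8 uint16_t elements.
void example_vst2_u16_roundtrip(uint16_t *dst, const uint16_t *src) {
  uint16x4x2_t v = vld2_u16(src); // evens land in v.val[0], odds in v.val[1]
  vst2_u16(dst, v);               // re-interleave back into the original order
}
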
// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
  vst3q_u8(a, b);
}

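// Also a sketch only, not a checked test (the function name is illustrative):
// the three-register form interleaves three vectors, the classic use being
// planar-to-packed RGB. dst is assumed to have room for 48 bytes (16 triples).
void example_vst3q_u8_rgb(uint8_t *dst, uint8x16_t r, uint8x16_t g, uint8x16_t b) {
  uint8x16x3_t rgb;
  rgb.val[0] = r;
  rgb.val[1] = g;
  rgb.val[2] = b;
  vst3q_u8(dst, rgb); // st3: dst = r0,g0,b0,r1,g1,b1,...
}
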
   12414 // CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
   12415 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
   12416 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
   12417 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
   12418 // CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
   12419 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
   12420 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
   12421 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
   12422 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   12423 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
   12424 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
   12425 // CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   12426 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
   12427 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
   12428 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
   12429 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   12430 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
   12431 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
   12432 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
   12433 // CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
   12434 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
   12435 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
   12436 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
   12437 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
   12438 // CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
   12439 // CHECK:   ret void
   12440 void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
   12441   vst3q_u16(a, b);
   12442 }
   12443 
   12444 // CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
   12445 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
   12446 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
   12447 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
   12448 // CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
   12449 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
   12450 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
   12451 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
   12452 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   12453 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
   12454 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
   12455 // CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
   12456 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
   12457 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
   12458 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
   12459 // CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
   12460 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
   12461 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
   12462 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
   12463 // CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
   12464 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
   12465 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
   12466 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
   12467 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
   12468 // CHECK:   call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
   12469 // CHECK:   ret void
   12470 void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
   12471   vst3q_u32(a, b);
   12472 }
   12473 
   12474 // CHECK-LABEL: define void @test_vst3q_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
   12475 // CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
   12476 // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
   12477 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
   12478 // CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
   12479 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
   12480 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
   12481 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
   12482 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   12483 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
   12484 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
   12485 // CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
   12486 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
   12487 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
   12488 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
   12489 // CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
   12490 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
   12491 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
   12492 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
   12493 // CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
   12494 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
   12495 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
   12496 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
   12497 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
   12498 // CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
   12499 // CHECK:   ret void
   12500 void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
   12501   vst3q_u64(a, b);
   12502 }
   12503 
   12504 // CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
   12505 // CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
   12506 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
   12507 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
   12508 // CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
   12509 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
   12510 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
   12511 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
   12512 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
   12513 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
   12514 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   12515 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
   12516 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
   12517 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   12518 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
   12519 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
   12520 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
   12521 // CHECK:   call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
   12522 // CHECK:   ret void
   12523 void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
   12524   vst3q_s8(a, b);
   12525 }
   12526 
// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
  vst3q_s16(a, b);
}

// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
  vst3q_s32(a, b);
}

// CHECK-LABEL: define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
  vst3q_s64(a, b);
}

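// Half-precision vectors have no dedicated store intrinsic in this
// lowering: the <8 x half> data is reinterpreted as <8 x i16> and stored
// through @llvm.aarch64.neon.st3.v8i16, as the checks below verify (the
// 64-bit vst3_f16 variant further down does the same with <4 x half>).
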
// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
  vst3q_f16(a, b);
}

// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK:   call void @llvm.aarch64.neon.st3.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
  vst3q_f32(a, b);
}

// CHECK-LABEL: define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
// CHECK:   call void @llvm.aarch64.neon.st3.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
  vst3q_f64(a, b);
}

// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
  vst3q_p8(a, b);
}

// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
  vst3q_p16(a, b);
}

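// The 64-bit (d-register) vst3 variants follow; they differ from the
// q-register forms only in vector width, 8-byte alignment, and the
// 24-byte memcpy. A usage sketch, illustrative only and not part of the
// checked output:
//
//   int16_t buf[12];
//   int16x4x3_t v = vld3_s16(buf);  // deinterleaving load
//   vst3_s16(buf, v);               // interleaving store: buf[0] = v.val[0][0],
//                                   // buf[1] = v.val[1][0], buf[2] = v.val[2][0], ...
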
// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
  vst3_u8(a, b);
}

// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
  vst3_u16(a, b);
}

// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
  vst3_u32(a, b);
}

// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
  vst3_u64(a, b);
}

// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst3_s8(int8_t *a, int8x8x3_t b) {
  vst3_s8(a, b);
}

// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_s16(int16_t *a, int16x4x3_t b) {
  vst3_s16(a, b);
}

// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_s32(int32_t *a, int32x2x3_t b) {
  vst3_s32(a, b);
}

// CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_s64(int64_t *a, int64x1x3_t b) {
  vst3_s64(a, b);
}

// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_f16(float16_t *a, float16x4x3_t b) {
  vst3_f16(a, b);
}

// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK:   call void @llvm.aarch64.neon.st3.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_f32(float32_t *a, float32x2x3_t b) {
  vst3_f32(a, b);
}

// CHECK-LABEL: define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
// CHECK:   call void @llvm.aarch64.neon.st3.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_f64(float64_t *a, float64x1x3_t b) {
  vst3_f64(a, b);
}

// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
  vst3_p8(a, b);
}

// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
  vst3_p16(a, b);
}

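// vst4q tests: same pattern as vst3q, but with four source vectors, a
// 64-byte memcpy, and a fourth getelementptr/load pair (VAL5/ARRAYIDX6)
// feeding @llvm.aarch64.neon.st4.
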
// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
// CHECK:   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
// CHECK:   ret void
void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
  vst4q_u8(a, b);
}

// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
  vst4q_u16(a, b);
}

// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK:   call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
  vst4q_u32(a, b);
}

// CHECK-LABEL: define void @test_vst4q_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
  vst4q_u64(a, b);
}

// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
   13265 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
   13266 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
   13267 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
   13268 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
   13269 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
   13270 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   13271 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
   13272 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
   13273 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   13274 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
   13275 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
   13276 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
   13277 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
   13278 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
   13279 // CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
   13280 // CHECK:   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
   13281 // CHECK:   ret void
   13282 void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
   13283   vst4q_s8(a, b);
   13284 }
   13285 
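// For byte-element variants such as vst4q_s8 above, the destination pointer
// is already i8* and the payload vectors are already <16 x i8>, so the
// bitcast round-trips seen in the wider-element tests are not emitted.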
   13286 // CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
   13287 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
   13288 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
   13289 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
   13290 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
   13291 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
   13292 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
   13293 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
   13294 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   13295 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
   13296 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
   13297 // CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   13298 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
   13299 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
   13300 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
   13301 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   13302 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
   13303 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
   13304 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
   13305 // CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
   13306 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
   13307 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
   13308 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
   13309 // CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
   13310 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
   13311 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
   13312 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
   13313 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
   13314 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
   13315 // CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
   13316 // CHECK:   ret void
   13317 void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
   13318   vst4q_s16(a, b);
   13319 }
   13320 
   13321 // CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
   13322 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
   13323 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
   13324 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
   13325 // CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
   13326 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
   13327 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
   13328 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
   13329 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   13330 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
   13331 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
   13332 // CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
   13333 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
   13334 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
   13335 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
   13336 // CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
   13337 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
   13338 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
   13339 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
   13340 // CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
   13341 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
   13342 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
   13343 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
   13344 // CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
   13345 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
   13346 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
   13347 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
   13348 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
   13349 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
   13350 // CHECK:   call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
   13351 // CHECK:   ret void
   13352 void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
   13353   vst4q_s32(a, b);
   13354 }
   13355 
   13356 // CHECK-LABEL: define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
   13357 // CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
   13358 // CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
   13359 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
   13360 // CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
   13361 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
   13362 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
   13363 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
   13364 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   13365 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
   13366 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
   13367 // CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
   13368 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
   13369 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
   13370 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
   13371 // CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
   13372 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
   13373 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
   13374 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
   13375 // CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
   13376 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
   13377 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
   13378 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
   13379 // CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
   13380 // CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
   13381 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
   13382 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
   13383 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
   13384 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
   13385 // CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
   13386 // CHECK:   ret void
   13387 void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
   13388   vst4q_s64(a, b);
   13389 }
   13390 
   13391 // CHECK-LABEL: define void @test_vst4q_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
   13392 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
   13393 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
   13394 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
   13395 // CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
   13396 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
   13397 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
   13398 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
   13399 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
   13400 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
   13401 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
   13402 // CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
   13403 // CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
   13404 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
   13405 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
   13406 // CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
   13407 // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
   13408 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
   13409 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
   13410 // CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
   13411 // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
   13412 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
   13413 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
   13414 // CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
   13415 // CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
   13416 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
   13417 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
   13418 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
   13419 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
   13420 // CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
   13421 // CHECK:   ret void
   13422 void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
   13423   vst4q_f16(a, b);
   13424 }
   13425 
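// As the checks above show, there is no half-precision st4 overload here:
// the <8 x half> values are bitcast through <16 x i8> to <8 x i16>, and the
// store is issued via @llvm.aarch64.neon.st4.v8i16.p0i8.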
   13426 // CHECK-LABEL: define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
   13427 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
   13428 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
   13429 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
   13430 // CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
   13431 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
   13432 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
   13433 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
   13434 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
   13435 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
   13436 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
   13437 // CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
   13438 // CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
   13439 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
   13440 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
   13441 // CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
   13442 // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
   13443 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
   13444 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
   13445 // CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
   13446 // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
   13447 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
   13448 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
   13449 // CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
   13450 // CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
   13451 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
   13452 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
   13453 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
   13454 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
   13455 // CHECK:   call void @llvm.aarch64.neon.st4.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i8* [[TMP2]])
   13456 // CHECK:   ret void
   13457 void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
   13458   vst4q_f32(a, b);
   13459 }
   13460 
   13461 // CHECK-LABEL: define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
   13462 // CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
   13463 // CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
   13464 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
   13465 // CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
   13466 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
   13467 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
   13468 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
   13469 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
   13470 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
   13471 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
   13472 // CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
   13473 // CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
   13474 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
   13475 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
   13476 // CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
   13477 // CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
   13478 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
   13479 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
   13480 // CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
   13481 // CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
   13482 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
   13483 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
   13484 // CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
   13485 // CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
   13486 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
   13487 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
   13488 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
   13489 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
   13490 // CHECK:   call void @llvm.aarch64.neon.st4.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i8* [[TMP2]])
   13491 // CHECK:   ret void
   13492 void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
   13493   vst4q_f64(a, b);
   13494 }
   13495 
   13496 // CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
   13497 // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
   13498 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
   13499 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
   13500 // CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
   13501 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
   13502 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
   13503 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
   13504 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
   13505 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
   13506 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   13507 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
   13508 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
   13509 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   13510 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
   13511 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
   13512 // CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
   13513 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
   13514 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
   13515 // CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
   13516 // CHECK:   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
   13517 // CHECK:   ret void
   13518 void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
   13519   vst4q_p8(a, b);
   13520 }
   13521 
   13522 // CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
   13523 // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
   13524 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
   13525 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
   13526 // CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
   13527 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
   13528 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
   13529 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
   13530 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   13531 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
   13532 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
   13533 // CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   13534 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
   13535 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
   13536 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
   13537 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   13538 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
   13539 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
   13540 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
   13541 // CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
   13542 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
   13543 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
   13544 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
   13545 // CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
   13546 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
   13547 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
   13548 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
   13549 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
   13550 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
   13551 // CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
   13552 // CHECK:   ret void
   13553 void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
   13554   vst4q_p16(a, b);
   13555 }
   13556 
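// The remaining tests cover the 64-bit (d-register) vst4 variants: the
// aggregates are 32 bytes with 8-byte alignment, so the memcpy size and
// alignment drop to 32 and 8, and the element bitcasts go through <8 x i8>
// rather than <16 x i8>. A scalar model analogous to the vst4q sketch above
// applies, with 4 (or 2, or 1) lanes per vector.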
   13557 // CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
   13558 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
   13559 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
   13560 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
   13561 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
   13562 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
   13563 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
   13564 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   13565 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   13566 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
   13567 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   13568 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   13569 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
   13570 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   13571 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   13572 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
   13573 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   13574 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   13575 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
   13576 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
   13577 // CHECK:   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
   13578 // CHECK:   ret void
   13579 void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
   13580   vst4_u8(a, b);
   13581 }
   13582 
   13583 // CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
   13584 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
   13585 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
   13586 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
   13587 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
   13588 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
   13589 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
   13590 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   13591 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   13592 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   13593 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
   13594 // CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   13595 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
   13596 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   13597 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
   13598 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   13599 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   13600 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   13601 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
   13602 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   13603 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   13604 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   13605 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
   13606 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
   13607 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
   13608 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   13609 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   13610 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   13611 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   13612 // CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
   13613 // CHECK:   ret void
   13614 void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
   13615   vst4_u16(a, b);
   13616 }
   13617 
   13618 // CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
   13619 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
   13620 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
   13621 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
   13622 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
   13623 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
   13624 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
   13625 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   13626 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   13627 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
   13628 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
   13629 // CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   13630 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
   13631 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
   13632 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
   13633 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   13634 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
   13635 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
   13636 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
   13637 // CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
   13638 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
   13639 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
   13640 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
   13641 // CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
   13642 // CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
   13643 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
   13644 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
   13645 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
   13646 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
   13647 // CHECK:   call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
   13648 // CHECK:   ret void
   13649 void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
   13650   vst4_u32(a, b);
   13651 }
   13652 
   13653 // CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
   13654 // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
   13655 // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
   13656 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
   13657 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
   13658 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
   13659 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
   13660 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   13661 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   13662 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
   13663 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
   13664 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
   13665 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
   13666 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
   13667 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
   13668 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
   13669 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
   13670 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
   13671 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
   13672 // CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
   13673 // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
   13674 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
   13675 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
   13676 // CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
   13677 // CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
   13678 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
   13679 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
   13680 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
   13681 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
   13682 // CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
   13683 // CHECK:   ret void
   13684 void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
   13685   vst4_u64(a, b);
   13686 }
   13687 
   13688 // CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
   13689 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
   13690 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
   13691 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
   13692 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
   13693 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
   13694 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
   13695 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   13696 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
   13697 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
   13698 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   13699 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
   13700 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
   13701 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   13702 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
   13703 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
   13704 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   13705 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
   13706 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
   13707 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
   13708 // CHECK:   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
   13709 // CHECK:   ret void
   13710 void test_vst4_s8(int8_t *a, int8x8x4_t b) {
   13711   vst4_s8(a, b);
   13712 }
   13713 
   13714 // CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
   13715 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
   13716 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
   13717 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
   13718 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
   13719 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
   13720 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
   13721 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   13722 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   13723 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   13724 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
   13725 // CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   13726 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
   13727 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   13728 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
   13729 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   13730 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   13731 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   13732 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
   13733 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   13734 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   13735 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   13736 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
   13737 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
   13738 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
   13739 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   13740 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   13741 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   13742 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   13743 // CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
   13744 // CHECK:   ret void
   13745 void test_vst4_s16(int16_t *a, int16x4x4_t b) {
   13746   vst4_s16(a, b);
   13747 }
   13748 
   13749 // CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
   13750 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
   13751 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
   13752 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
   13753 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
   13754 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
   13755 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
   13756 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   13757 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   13758 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   13759 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
   13760 // CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   13761 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
   13762 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   13763 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
   13764 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   13765 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
   13766 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   13767 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
   13768 // CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
   13769 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
   13770 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   13771 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
   13772 // CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
   13773 // CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
   13774 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
   13775 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
   13776 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
   13777 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
   13778 // CHECK:   call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
   13779 // CHECK:   ret void
   13780 void test_vst4_s32(int32_t *a, int32x2x4_t b) {
   13781   vst4_s32(a, b);
   13782 }
   13783 
   13784 // CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
   13785 // CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
   13786 // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
   13787 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
   13788 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
   13789 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
   13790 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
   13791 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   13792 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   13793 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
   13794 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
   13795 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
   13796 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
   13797 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
   13798 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
   13799 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
   13800 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
   13801 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
   13802 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
   13803 // CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
   13804 // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
   13805 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
   13806 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
   13807 // CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
   13808 // CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
   13809 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
   13810 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
   13811 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
   13812 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
   13813 // CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
   13814 // CHECK:   ret void
   13815 void test_vst4_s64(int64_t *a, int64x1x4_t b) {
   13816   vst4_s64(a, b);
   13817 }
   13818 
   13819 // CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
   13820 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
   13821 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
   13822 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
   13823 // CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
   13824 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
   13825 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
   13826 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   13827 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
   13828 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   13829 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
   13830 // CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
   13831 // CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
   13832 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   13833 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
   13834 // CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
   13835 // CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
   13836 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   13837 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
   13838 // CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
   13839 // CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
   13840 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   13841 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
   13842 // CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
   13843 // CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
   13844 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   13845 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   13846 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   13847 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   13848 // CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
   13849 // CHECK:   ret void
   13850 void test_vst4_f16(float16_t *a, float16x4x4_t b) {
   13851   vst4_f16(a, b);
   13852 }
   13853 
// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
// CHECK:   call void @llvm.aarch64.neon.st4.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst4_f32(float32_t *a, float32x2x4_t b) {
  vst4_f32(a, b);
}

// CHECK-LABEL: define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
// CHECK:   call void @llvm.aarch64.neon.st4.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst4_f64(float64_t *a, float64x1x4_t b) {
  vst4_f64(a, b);
}

// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK:   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
// CHECK:   ret void
void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
  vst4_p8(a, b);
}

// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
// CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
// CHECK:   ret void
void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
  vst4_p16(a, b);
}

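// The vld1*_x2 tests check that each two-vector load intrinsic becomes a
// single @llvm.aarch64.neon.ld1x2.* call whose literal-struct result is
// stored to a temporary and then copied into the returned aggregate with
// @llvm.memcpy. Illustrative usage only -- a hedged sketch that is not part
// of the FileCheck run, and the buffer `buf` is hypothetical:
//
//   uint8_t buf[32];
//   uint8x16x2_t pair = vld1q_u8_x2(buf); // loads 32 consecutive bytes
//   uint8x16_t lo = pair.val[0];          // buf[0..15]
//   uint8x16_t hi = pair.val[1];          // buf[16..31]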
// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld1q_u8_x2(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint8x16x2_t [[TMP4]]
uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
  return vld1q_u8_x2(a);
}

// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld1q_u16_x2(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
  return vld1q_u16_x2(a);
}

// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld1q_u32_x2(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
  return vld1q_u32_x2(a);
}

// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld1q_u64_x2(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
  return vld1q_u64_x2(a);
}

// CHECK-LABEL: define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int8x16x2_t [[TMP4]]
int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
  return vld1q_s8_x2(a);
}

// CHECK-LABEL: define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
  return vld1q_s16_x2(a);
}

// CHECK-LABEL: define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
  return vld1q_s32_x2(a);
}

// CHECK-LABEL: define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
  return vld1q_s64_x2(a);
}

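// Note: the float16 variants below are checked as integer vectors; the half*
// argument is bitcast to i16* and the v8i16/v4i16 forms of the intrinsic are
// used, since f16 data is carried as i16 elements in this IR.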
// CHECK-LABEL: define %struct.float16x8x2_t @test_vld1q_f16_x2(half* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
  return vld1q_f16_x2(a);
}

// CHECK-LABEL: define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
  return vld1q_f32_x2(a);
}

// CHECK-LABEL: define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
// CHECK:   store { <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
  return vld1q_f64_x2(a);
}

// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld1q_p8_x2(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly8x16x2_t [[TMP4]]
poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
  return vld1q_p8_x2(a);
}

// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld1q_p16_x2(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
  return vld1q_p16_x2(a);
}

// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld1q_p64_x2(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
  return vld1q_p64_x2(a);
}

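// The 64-bit (d-register) _x2 variants repeat the pattern with half-width
// vectors: each struct is 16 bytes, so the final memcpy copies i64 16 at
// align 8 rather than i64 32 at align 16.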
// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld1_u8_x2(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint8x8x2_t [[TMP4]]
uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
  return vld1_u8_x2(a);
}

// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld1_u16_x2(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
  return vld1_u16_x2(a);
}

// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld1_u32_x2(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
  return vld1_u32_x2(a);
}

// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld1_u64_x2(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
  return vld1_u64_x2(a);
}

// CHECK-LABEL: define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int8x8x2_t [[TMP4]]
int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
  return vld1_s8_x2(a);
}

// CHECK-LABEL: define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
  return vld1_s16_x2(a);
}

// CHECK-LABEL: define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
  return vld1_s32_x2(a);
}

// CHECK-LABEL: define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
  return vld1_s64_x2(a);
}

// CHECK-LABEL: define %struct.float16x4x2_t @test_vld1_f16_x2(half* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
  return vld1_f16_x2(a);
}

// CHECK-LABEL: define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
  return vld1_f32_x2(a);
}

// CHECK-LABEL: define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
// CHECK:   [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
// CHECK:   store { <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
  return vld1_f64_x2(a);
}

// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld1_p8_x2(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly8x8x2_t [[TMP4]]
poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
  return vld1_p8_x2(a);
}

// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld1_p16_x2(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
  return vld1_p16_x2(a);
}

// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld1_p64_x2(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
  return vld1_p64_x2(a);
}

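// The vld1*_x3 tests extend the same lowering to three-vector loads:
// @llvm.aarch64.neon.ld1x3.* returns a three-element literal struct, and the
// q-register forms copy i64 48 bytes into the returned aggregate.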
   14477 // CHECK-LABEL: define %struct.uint8x16x3_t @test_vld1q_u8_x3(i8* %a) #0 {
   14478 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
   14479 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
   14480 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
   14481 // CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
   14482 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
   14483 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
   14484 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
   14485 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
   14486 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
   14487 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
   14488 // CHECK:   ret %struct.uint8x16x3_t [[TMP4]]
   14489 uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
   14490   return vld1q_u8_x3(a);
   14491 }
   14492 
   14493 // CHECK-LABEL: define %struct.uint16x8x3_t @test_vld1q_u16_x3(i16* %a) #0 {
   14494 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
   14495 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
   14496 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
   14497 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   14498 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
   14499 // CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
   14500 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   14501 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
   14502 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
   14503 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
   14504 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   14505 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
   14506 // CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
   14507 uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
   14508   return vld1q_u16_x3(a);
   14509 }
   14510 
   14511 // CHECK-LABEL: define %struct.uint32x4x3_t @test_vld1q_u32_x3(i32* %a) #0 {
   14512 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
   14513 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
   14514 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
   14515 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   14516 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
   14517 // CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]])
   14518 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
   14519 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
   14520 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
   14521 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
   14522 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   14523 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
   14524 // CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
   14525 uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
   14526   return vld1q_u32_x3(a);
   14527 }
   14528 
   14529 // CHECK-LABEL: define %struct.uint64x2x3_t @test_vld1q_u64_x3(i64* %a) #0 {
   14530 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
   14531 // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
   14532 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
   14533 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   14534 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
   14535 // CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
   14536 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
   14537 // CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
   14538 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
   14539 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
   14540 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   14541 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
   14542 // CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
   14543 uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
   14544   return vld1q_u64_x3(a);
   14545 }
   14546 
   14547 // CHECK-LABEL: define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) #0 {
   14548 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
   14549 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
   14550 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
   14551 // CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
   14552 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
   14553 // CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
   14554 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
   14555 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
   14556 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
   14557 // CHECK:   [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
   14558 // CHECK:   ret %struct.int8x16x3_t [[TMP4]]
   14559 int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
   14560   return vld1q_s8_x3(a);
   14561 }
   14562 
   14563 // CHECK-LABEL: define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) #0 {
   14564 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
   14565 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
   14566 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
   14567 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   14568 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
   14569 // CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
   14570 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
   14571 // CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
   14572 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
   14573 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
   14574 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   14575 // CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
   14576 // CHECK:   ret %struct.int16x8x3_t [[TMP6]]
   14577 int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
   14578   return vld1q_s16_x3(a);
   14579 }
   14580 
   14581 // CHECK-LABEL: define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) #0 {
   14582 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
   14583 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
   14584 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
   14585 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   14586 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
   14587 // CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]])
   14588 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
   14589 // CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
   14590 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
   14591 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
   14592 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
   14593 // CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
   14594 // CHECK:   ret %struct.int32x4x3_t [[TMP6]]
   14595 int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
   14596   return vld1q_s32_x3(a);
   14597 }
   14598 
// CHECK-LABEL: define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int64x2x3_t [[TMP6]]
int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
  return vld1q_s64_x3(a);
}

// CHECK-LABEL: define %struct.float16x8x3_t @test_vld1q_f16_x3(half* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float16x8x3_t [[TMP6]]
float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
  return vld1q_f16_x3(a);
}

// CHECK-LABEL: define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float32x4x3_t [[TMP6]]
float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
  return vld1q_f32_x3(a);
}

// CHECK-LABEL: define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float64x2x3_t [[TMP6]]
float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
  return vld1q_f64_x3(a);
}

// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld1q_p8_x3(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly8x16x3_t [[TMP4]]
poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
  return vld1q_p8_x3(a);
}

// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld1q_p16_x3(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
  return vld1q_p16_x3(a);
}

// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld1q_p64_x3(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
  return vld1q_p64_x3(a);
}

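// The 64-bit ("d" register) _x3 variants return 24-byte structs with 8-byte
// alignment, so the memcpy below copies 24 bytes rather than 48. For i8
// elements the argument is already an i8*, so no bitcast round-trip appears.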
// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld1_u8_x3(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint8x8x3_t [[TMP4]]
uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
  return vld1_u8_x3(a);
}

// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld1_u16_x3(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
  return vld1_u16_x3(a);
}

// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld1_u32_x3(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
  return vld1_u32_x3(a);
}

// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld1_u64_x3(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
  return vld1_u64_x3(a);
}

// CHECK-LABEL: define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int8x8x3_t [[TMP4]]
int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
  return vld1_s8_x3(a);
}

// CHECK-LABEL: define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
  return vld1_s16_x3(a);
}

// CHECK-LABEL: define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
  return vld1_s32_x3(a);
}

// CHECK-LABEL: define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
  return vld1_s64_x3(a);
}

// CHECK-LABEL: define %struct.float16x4x3_t @test_vld1_f16_x3(half* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
  return vld1_f16_x3(a);
}

// CHECK-LABEL: define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
  return vld1_f32_x3(a);
}

// CHECK-LABEL: define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
// CHECK:   [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
  return vld1_f64_x3(a);
}

// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld1_p8_x3(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly8x8x3_t [[TMP4]]
poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
  return vld1_p8_x3(a);
}

// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld1_p16_x3(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
  return vld1_p16_x3(a);
}

// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld1_p64_x3(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
  return vld1_p64_x3(a);
}

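// The _x4 variants load four consecutive vectors through the
// llvm.aarch64.neon.ld1x4 intrinsics; the 128-bit ("q" register) forms
// return 64-byte structs copied at 16-byte alignment.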
// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld1q_u8_x4(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint8x16x4_t [[TMP4]]
uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
  return vld1q_u8_x4(a);
}

// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld1q_u16_x4(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
  return vld1q_u16_x4(a);
}

// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld1q_u32_x4(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
  return vld1q_u32_x4(a);
}

// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld1q_u64_x4(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
  return vld1q_u64_x4(a);
}

// CHECK-LABEL: define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int8x16x4_t [[TMP4]]
int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
  return vld1q_s8_x4(a);
}

// CHECK-LABEL: define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
  return vld1q_s16_x4(a);
}

// CHECK-LABEL: define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
  return vld1q_s32_x4(a);
}

// CHECK-LABEL: define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
  return vld1q_s64_x4(a);
}

// CHECK-LABEL: define %struct.float16x8x4_t @test_vld1q_f16_x4(half* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
  return vld1q_f16_x4(a);
}

// CHECK-LABEL: define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
// CHECK:   [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
  return vld1q_f32_x4(a);
}

// CHECK-LABEL: define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
  return vld1q_f64_x4(a);
}

// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld1q_p8_x4(i8* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly8x16x4_t [[TMP4]]
poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
  return vld1q_p8_x4(a);
}

// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld1q_p16_x4(i16* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
  return vld1q_p16_x4(a);
}

// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld1q_p64_x4(i64* %a) #0 {
// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
  return vld1q_p64_x4(a);
}

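// The 64-bit _x4 variants return 32-byte structs copied at 8-byte alignment.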
   15215 // CHECK-LABEL: define %struct.uint8x8x4_t @test_vld1_u8_x4(i8* %a) #0 {
   15216 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
   15217 // CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
   15218 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
   15219 // CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
   15220 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
   15221 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
   15222 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
   15223 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
   15224 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
   15225 // CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
   15226 // CHECK:   ret %struct.uint8x8x4_t [[TMP4]]
   15227 uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
   15228   return vld1_u8_x4(a);
   15229 }
   15230 
   15231 // CHECK-LABEL: define %struct.uint16x4x4_t @test_vld1_u16_x4(i16* %a) #0 {
   15232 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
   15233 // CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
   15234 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
   15235 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   15236 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
   15237 // CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
   15238 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   15239 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
   15240 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
   15241 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
   15242 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15243 // CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
   15244 // CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
   15245 uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
   15246   return vld1_u16_x4(a);
   15247 }
   15248 
   15249 // CHECK-LABEL: define %struct.uint32x2x4_t @test_vld1_u32_x4(i32* %a) #0 {
   15250 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
   15251 // CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
   15252 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
   15253 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   15254 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
   15255 // CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]])
   15256 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
   15257 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
   15258 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
   15259 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
   15260 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15261 // CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
   15262 // CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
   15263 uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
   15264   return vld1_u32_x4(a);
   15265 }
   15266 
   15267 // CHECK-LABEL: define %struct.uint64x1x4_t @test_vld1_u64_x4(i64* %a) #0 {
   15268 // CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
   15269 // CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
   15270 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
   15271 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   15272 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
   15273 // CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
   15274 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
   15275 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
   15276 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
   15277 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
   15278 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15279 // CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
   15280 // CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
   15281 uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
   15282   return vld1_u64_x4(a);
   15283 }
   15284 
   15285 // CHECK-LABEL: define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) #0 {
   15286 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
   15287 // CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
   15288 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
   15289 // CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
   15290 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
   15291 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
   15292 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
   15293 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
   15294 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
   15295 // CHECK:   [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
   15296 // CHECK:   ret %struct.int8x8x4_t [[TMP4]]
   15297 int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
   15298   return vld1_s8_x4(a);
   15299 }
   15300 
   15301 // CHECK-LABEL: define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) #0 {
   15302 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
   15303 // CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
   15304 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
   15305 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   15306 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
   15307 // CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
   15308 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   15309 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
   15310 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
   15311 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
   15312 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15313 // CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
   15314 // CHECK:   ret %struct.int16x4x4_t [[TMP6]]
   15315 int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
   15316   return vld1_s16_x4(a);
   15317 }
   15318 
   15319 // CHECK-LABEL: define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) #0 {
   15320 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
   15321 // CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
   15322 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
   15323 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
   15324 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
   15325 // CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]])
   15326 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
   15327 // CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
   15328 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
   15329 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
   15330 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15331 // CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
   15332 // CHECK:   ret %struct.int32x2x4_t [[TMP6]]
   15333 int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
   15334   return vld1_s32_x4(a);
   15335 }
   15336 
   15337 // CHECK-LABEL: define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) #0 {
   15338 // CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
   15339 // CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
   15340 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
   15341 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   15342 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
   15343 // CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
   15344 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
   15345 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
   15346 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
   15347 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
   15348 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15349 // CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
   15350 // CHECK:   ret %struct.int64x1x4_t [[TMP6]]
   15351 int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
   15352   return vld1_s64_x4(a);
   15353 }
   15354 
   15355 // CHECK-LABEL: define %struct.float16x4x4_t @test_vld1_f16_x4(half* %a) #0 {
   15356 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
   15357 // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
   15358 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
   15359 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
   15360 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
   15361 // CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
   15362 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   15363 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
   15364 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
   15365 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
   15366 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15367 // CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
   15368 // CHECK:   ret %struct.float16x4x4_t [[TMP6]]
   15369 float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
   15370   return vld1_f16_x4(a);
   15371 }
   15372 
   15373 // CHECK-LABEL: define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) #0 {
   15374 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
   15375 // CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
   15376 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
   15377 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
   15378 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
   15379 // CHECK:   [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* [[TMP2]])
   15380 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
   15381 // CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
   15382 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
   15383 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
   15384 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15385 // CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
   15386 // CHECK:   ret %struct.float32x2x4_t [[TMP6]]
   15387 float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
   15388   return vld1_f32_x4(a);
   15389 }
   15390 
   15391 // CHECK-LABEL: define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) #0 {
   15392 // CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
   15393 // CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
   15394 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
   15395 // CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
   15396 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
   15397 // CHECK:   [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* [[TMP2]])
   15398 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
   15399 // CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
   15400 // CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
   15401 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
   15402 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15403 // CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
   15404 // CHECK:   ret %struct.float64x1x4_t [[TMP6]]
   15405 float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
   15406   return vld1_f64_x4(a);
   15407 }
   15408 
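// For 8-bit element types the argument is already an i8*, so the
// pointer-to-i8-and-back bitcast pair seen above is omitted and %a is passed
// to the ld1x4 call directly.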
   15409 // CHECK-LABEL: define %struct.poly8x8x4_t @test_vld1_p8_x4(i8* %a) #0 {
   15410 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
   15411 // CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
   15412 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
   15413 // CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
   15414 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
   15415 // CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
   15416 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
   15417 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
   15418 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
   15419 // CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
   15420 // CHECK:   ret %struct.poly8x8x4_t [[TMP4]]
   15421 poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
   15422   return vld1_p8_x4(a);
   15423 }
   15424 
   15425 // CHECK-LABEL: define %struct.poly16x4x4_t @test_vld1_p16_x4(i16* %a) #0 {
   15426 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
   15427 // CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
   15428 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
   15429 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
   15430 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
   15431 // CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
   15432 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
   15433 // CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
   15434 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
   15435 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
   15436 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15437 // CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
   15438 // CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
   15439 poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
   15440   return vld1_p16_x4(a);
   15441 }
   15442 
   15443 // CHECK-LABEL: define %struct.poly64x1x4_t @test_vld1_p64_x4(i64* %a) #0 {
   15444 // CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
   15445 // CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
   15446 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
   15447 // CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
   15448 // CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
   15449 // CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
   15450 // CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
   15451 // CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
   15452 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
   15453 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
   15454 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
   15455 // CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
   15456 // CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
   15457 poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
   15458   return vld1_p64_x4(a);
   15459 }
   15460 
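// The vst1q_*_x2 tests mirror the loads in reverse: the coerced
// [2 x <N x T>] argument is spilled into a local alloca, memcpy'd into the
// __s1 copy, each vector is loaded from the struct (bitcast through
// <16 x i8> for non-i8 element types), and one @llvm.aarch64.neon.st1x2.*
// call writes both registers out.
//
// A minimal usage sketch, kept in a comment so it adds no IR for FileCheck
// to match; the helper name and arguments are illustrative only:
//   static void copy32(uint8_t *dst, const uint8_t *src) {
//     uint8x16x2_t v = vld1q_u8_x2(src);
//     vst1q_u8_x2(dst, v);
//   }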
   15461 // CHECK-LABEL: define void @test_vst1q_u8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
   15462 // CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
   15463 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
   15464 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
   15465 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
   15466 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
   15467 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
   15468 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15469 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
   15470 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
   15471 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   15472 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
   15473 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
   15474 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   15475 // CHECK:   call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
   15476 // CHECK:   ret void
   15477 void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) {
   15478   vst1q_u8_x2(a, b);
   15479 }
   15480 
   15481 // CHECK-LABEL: define void @test_vst1q_u16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
   15482 // CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
   15483 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
   15484 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
   15485 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
   15486 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
   15487 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
   15488 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15489 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   15490 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
   15491 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
   15492 // CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   15493 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
   15494 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
   15495 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
   15496 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   15497 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
   15498 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
   15499 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
   15500 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
   15501 // CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
   15502 // CHECK:   ret void
   15503 void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) {
   15504   vst1q_u16_x2(a, b);
   15505 }
   15506 
   15507 // CHECK-LABEL: define void @test_vst1q_u32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
   15508 // CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
   15509 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
   15510 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
   15511 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
   15512 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
   15513 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
   15514 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15515 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   15516 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
   15517 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
   15518 // CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
   15519 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
   15520 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
   15521 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
   15522 // CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
   15523 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
   15524 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
   15525 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
   15526 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
   15527 // CHECK:   call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
   15528 // CHECK:   ret void
   15529 void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) {
   15530   vst1q_u32_x2(a, b);
   15531 }
   15532 
   15533 // CHECK-LABEL: define void @test_vst1q_u64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
   15534 // CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
   15535 // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
   15536 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
   15537 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
   15538 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
   15539 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
   15540 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15541 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   15542 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
   15543 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
   15544 // CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
   15545 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
   15546 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
   15547 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
   15548 // CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
   15549 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
   15550 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
   15551 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
   15552 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
   15553 // CHECK:   call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
   15554 // CHECK:   ret void
   15555 void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) {
   15556   vst1q_u64_x2(a, b);
   15557 }
   15558 
   15559 // CHECK-LABEL: define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
   15560 // CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
   15561 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
   15562 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
   15563 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
   15564 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
   15565 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
   15566 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15567 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
   15568 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
   15569 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   15570 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
   15571 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
   15572 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   15573 // CHECK:   call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
   15574 // CHECK:   ret void
   15575 void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
   15576   vst1q_s8_x2(a, b);
   15577 }
   15578 
   15579 // CHECK-LABEL: define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
   15580 // CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
   15581 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
   15582 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
   15583 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
   15584 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
   15585 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
   15586 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15587 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   15588 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
   15589 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
   15590 // CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   15591 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
   15592 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
   15593 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
   15594 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   15595 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
   15596 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
   15597 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
   15598 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
   15599 // CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
   15600 // CHECK:   ret void
   15601 void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
   15602   vst1q_s16_x2(a, b);
   15603 }
   15604 
   15605 // CHECK-LABEL: define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
   15606 // CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
   15607 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
   15608 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
   15609 // CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
   15610 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
   15611 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
   15612 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15613 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   15614 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
   15615 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
   15616 // CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
   15617 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
   15618 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
   15619 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
   15620 // CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
   15621 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
   15622 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
   15623 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
   15624 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
   15625 // CHECK:   call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
   15626 // CHECK:   ret void
   15627 void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
   15628   vst1q_s32_x2(a, b);
   15629 }
   15630 
   15631 // CHECK-LABEL: define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
   15632 // CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
   15633 // CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
   15634 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
   15635 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
   15636 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
   15637 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
   15638 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15639 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   15640 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
   15641 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
   15642 // CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
   15643 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
   15644 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
   15645 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
   15646 // CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
   15647 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
   15648 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
   15649 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
   15650 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
   15651 // CHECK:   call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
   15652 // CHECK:   ret void
   15653 void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
   15654   vst1q_s64_x2(a, b);
   15655 }
   15656 
   15657 // CHECK-LABEL: define void @test_vst1q_f16_x2(half* %a, [2 x <8 x half>] %b.coerce) #0 {
   15658 // CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
   15659 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
   15660 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
   15661 // CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
   15662 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
   15663 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
   15664 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15665 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
   15666 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
   15667 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
   15668 // CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
   15669 // CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
   15670 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
   15671 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
   15672 // CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
   15673 // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
   15674 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
   15675 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
   15676 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
   15677 // CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
   15678 // CHECK:   ret void
   15679 void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
   15680   vst1q_f16_x2(a, b);
   15681 }
   15682 
   15683 // CHECK-LABEL: define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b.coerce) #0 {
   15684 // CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
   15685 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
   15686 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
   15687 // CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
   15688 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
   15689 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
   15690 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15691 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
   15692 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
   15693 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
   15694 // CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
   15695 // CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
   15696 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
   15697 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
   15698 // CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
   15699 // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
   15700 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
   15701 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
   15702 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
   15703 // CHECK:   call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], float* [[TMP9]])
   15704 // CHECK:   ret void
   15705 void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
   15706   vst1q_f32_x2(a, b);
   15707 }
   15708 
   15709 // CHECK-LABEL: define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b.coerce) #0 {
   15710 // CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
   15711 // CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
   15712 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
   15713 // CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
   15714 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
   15715 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
   15716 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15717 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
   15718 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
   15719 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
   15720 // CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
   15721 // CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
   15722 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
   15723 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
   15724 // CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
   15725 // CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
   15726 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
   15727 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
   15728 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double*
   15729 // CHECK:   call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], double* [[TMP9]])
   15730 // CHECK:   ret void
   15731 void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) {
   15732   vst1q_f64_x2(a, b);
   15733 }
   15734 
   15735 // CHECK-LABEL: define void @test_vst1q_p8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
   15736 // CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
   15737 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
   15738 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
   15739 // CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
   15740 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
   15741 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
   15742 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15743 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
   15744 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
   15745 // CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
   15746 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
   15747 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
   15748 // CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
   15749 // CHECK:   call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
   15750 // CHECK:   ret void
   15751 void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
   15752   vst1q_p8_x2(a, b);
   15753 }
   15754 
   15755 // CHECK-LABEL: define void @test_vst1q_p16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
   15756 // CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
   15757 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
   15758 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
   15759 // CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
   15760 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
   15761 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
   15762 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15763 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   15764 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
   15765 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
   15766 // CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
   15767 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
   15768 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
   15769 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
   15770 // CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
   15771 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
   15772 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
   15773 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
   15774 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
   15775 // CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
   15776 // CHECK:   ret void
   15777 void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
   15778   vst1q_p16_x2(a, b);
   15779 }
   15780 
   15781 // CHECK-LABEL: define void @test_vst1q_p64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
   15782 // CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
   15783 // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
   15784 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
   15785 // CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
   15786 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
   15787 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
   15788 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
   15789 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   15790 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
   15791 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
   15792 // CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
   15793 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
   15794 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
   15795 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
   15796 // CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
   15797 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
   15798 // CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
   15799 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
   15800 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
   15801 // CHECK:   call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
   15802 // CHECK:   ret void
   15803 void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
   15804   vst1q_p64_x2(a, b);
   15805 }
   15806 
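// The 64-bit (d-register) vst1_*_x2 variants below repeat the pattern at
// half size: the structs are 16 bytes with 8-byte alignment, and the
// intermediate bitcasts go through <8 x i8> rather than <16 x i8>.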
   15807 // CHECK-LABEL: define void @test_vst1_u8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
   15808 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
   15809 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
   15810 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
   15811 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
   15812 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
   15813 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
   15814 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   15815 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
   15816 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
   15817 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   15818 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
   15819 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
   15820 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   15821 // CHECK:   call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
   15822 // CHECK:   ret void
   15823 void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
   15824   vst1_u8_x2(a, b);
   15825 }
   15826 
   15827 // CHECK-LABEL: define void @test_vst1_u16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
   15828 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
   15829 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
   15830 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
   15831 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
   15832 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
   15833 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
   15834 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   15835 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   15836 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
   15837 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
   15838 // CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   15839 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
   15840 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
   15841 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
   15842 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   15843 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   15844 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   15845 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   15846 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
   15847 // CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
   15848 // CHECK:   ret void
   15849 void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
   15850   vst1_u16_x2(a, b);
   15851 }
   15852 
   15853 // CHECK-LABEL: define void @test_vst1_u32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
   15854 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
   15855 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
   15856 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
   15857 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
   15858 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
   15859 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
   15860 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   15861 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   15862 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
   15863 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
   15864 // CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   15865 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
   15866 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
   15867 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
   15868 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   15869 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
   15870 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
   15871 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
   15872 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
   15873 // CHECK:   call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
   15874 // CHECK:   ret void
   15875 void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
   15876   vst1_u32_x2(a, b);
   15877 }
   15878 
   15879 // CHECK-LABEL: define void @test_vst1_u64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
   15880 // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
   15881 // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
   15882 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
   15883 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
   15884 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
   15885 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
   15886 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   15887 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   15888 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
   15889 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
   15890 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
   15891 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
   15892 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
   15893 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
   15894 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
   15895 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
   15896 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
   15897 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
   15898 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
   15899 // CHECK:   call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
   15900 // CHECK:   ret void
   15901 void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
   15902   vst1_u64_x2(a, b);
   15903 }
   15904 
   15905 // CHECK-LABEL: define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
   15906 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
   15907 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
   15908 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
   15909 // CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
   15910 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
   15911 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
   15912 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   15913 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
   15914 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
   15915 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   15916 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
   15917 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
   15918 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   15919 // CHECK:   call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
   15920 // CHECK:   ret void
   15921 void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
   15922   vst1_s8_x2(a, b);
   15923 }
   15924 
   15925 // CHECK-LABEL: define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
   15926 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
   15927 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
   15928 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
   15929 // CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
   15930 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
   15931 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
   15932 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   15933 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   15934 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
   15935 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
   15936 // CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   15937 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
   15938 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
   15939 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
   15940 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   15941 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   15942 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   15943 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   15944 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
   15945 // CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
   15946 // CHECK:   ret void
   15947 void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
   15948   vst1_s16_x2(a, b);
   15949 }
   15950 
   15951 // CHECK-LABEL: define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
   15952 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
   15953 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
   15954 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
   15955 // CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
   15956 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
   15957 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
   15958 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   15959 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   15960 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
   15961 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
   15962 // CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   15963 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
   15964 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
   15965 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
   15966 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   15967 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
   15968 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
   15969 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
   15970 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
   15971 // CHECK:   call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
   15972 // CHECK:   ret void
   15973 void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
   15974   vst1_s32_x2(a, b);
   15975 }
   15976 
   15977 // CHECK-LABEL: define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
   15978 // CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
   15979 // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
   15980 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
   15981 // CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
   15982 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
   15983 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
   15984 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   15985 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   15986 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
   15987 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
   15988 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
   15989 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
   15990 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
   15991 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
   15992 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
   15993 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
   15994 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
   15995 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
   15996 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
   15997 // CHECK:   call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
   15998 // CHECK:   ret void
   15999 void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
   16000   vst1_s64_x2(a, b);
   16001 }
   16002 
   16003 // CHECK-LABEL: define void @test_vst1_f16_x2(half* %a, [2 x <4 x half>] %b.coerce) #0 {
   16004 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
   16005 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
   16006 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
   16007 // CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
   16008 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
   16009 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
   16010 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   16011 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
   16012 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
   16013 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
   16014 // CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
   16015 // CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
   16016 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
   16017 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
   16018 // CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
   16019 // CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
   16020 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   16021 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   16022 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
   16023 // CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
   16024 // CHECK:   ret void
   16025 void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
   16026   vst1_f16_x2(a, b);
   16027 }
   16028 
   16029 // CHECK-LABEL: define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b.coerce) #0 {
   16030 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
   16031 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
   16032 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
   16033 // CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
   16034 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
   16035 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
   16036 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
   16037 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
   16038 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
   16039 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
   16040 // CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
   16041 // CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
   16042 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
   16043 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
   16044 // CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
   16045 // CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
   16046 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
   16047 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
   16048 // CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
   16049 // CHECK:   call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], float* [[TMP9]])
   16050 // CHECK:   ret void
   16051 void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
   16052   vst1_f32_x2(a, b);
   16053 }
   16054 
// CHECK-LABEL: define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double*
// CHECK:   call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], double* [[TMP9]])
// CHECK:   ret void
void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) {
  vst1_f64_x2(a, b);
}

// CHECK-LABEL: define void @test_vst1_p8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
// CHECK:   ret void
void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
  vst1_p8_x2(a, b);
}

// CHECK-LABEL: define void @test_vst1_p16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
// CHECK:   ret void
void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
  vst1_p16_x2(a, b);
}

// CHECK-LABEL: define void @test_vst1_p64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
// CHECK:   ret void
void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
  vst1_p64_x2(a, b);
}

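// The tests below move on to the three-vector variants: each vst1q*_x3
// call lowers to @llvm.aarch64.neon.st1x3.* and stores three consecutive
// q registers to memory. The temporary copy grows to 48 bytes with
// 16-byte alignment, and element types wider than i8 round-trip through
// <16 x i8> bitcasts before the call, mirroring the x2 forms above.
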
// CHECK-LABEL: define void @test_vst1q_u8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) {
  vst1q_u8_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_u16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK:   ret void
void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) {
  vst1q_u16_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_u32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
// CHECK:   ret void
void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) {
  vst1q_u32_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_u64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
// CHECK:   ret void
void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) {
  vst1q_u64_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) {
  vst1q_s8_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK:   ret void
void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
  vst1q_s16_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
// CHECK:   ret void
void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
  vst1q_s32_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
// CHECK:   ret void
void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) {
  vst1q_s64_x3(a, b);
}

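// Note that the f16 variant below is checked against the integer
// intrinsic @llvm.aarch64.neon.st1x3.v8i16: the <8 x half> lanes are
// bitcast to <16 x i8> and then to <8 x i16> before the store, so no
// half-typed intrinsic is involved here.
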
// CHECK-LABEL: define void @test_vst1q_f16_x3(half* %a, [3 x <8 x half>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK:   ret void
void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) {
  vst1q_f16_x3(a, b);
}

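// By contrast, the f32 and f64 quad forms below keep their element types
// through the lowering and reach @llvm.aarch64.neon.st1x3.v4f32 and
// @llvm.aarch64.neon.st1x3.v2f64 respectively.
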
// CHECK-LABEL: define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], float* [[TMP12]])
// CHECK:   ret void
void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
  vst1q_f32_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], double* [[TMP12]])
// CHECK:   ret void
void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) {
  vst1q_f64_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_p8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
  vst1q_p8_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_p16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK:   ret void
void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
  vst1q_p16_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1q_p64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
// CHECK:   ret void
void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
  vst1q_p64_x3(a, b);
}

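// The remaining tests cover the 64-bit (d) three-vector forms, where the
// temporary copy shrinks to 24 bytes with 8-byte alignment. Illustrative
// usage (our helper, not CHECK-verified): splitting 24 consecutive bytes
// across three d registers and storing them contiguously.
void example_store_three_u8(uint8_t *dst, uint8x8_t v0, uint8x8_t v1,
                            uint8x8_t v2) {
  uint8x8x3_t triple;
  triple.val[0] = v0; // written to dst[0..7]
  triple.val[1] = v1; // written to dst[8..15]
  triple.val[2] = v2; // written to dst[16..23]
  vst1_u8_x3(dst, triple);
}
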
// CHECK-LABEL: define void @test_vst1_u8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
  vst1_u8_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1_u16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK:   ret void
void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
  vst1_u16_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1_u32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
// CHECK:   ret void
void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
  vst1_u32_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1_u64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
// CHECK:   ret void
void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
  vst1_u64_x3(a, b);
}

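// As with the quad forms, the i8 variants need no pointer or lane
// bitcasts: the loaded <8 x i8> values and the incoming i8* are passed
// to the intrinsic unchanged.
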
   16679 // CHECK-LABEL: define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
   16680 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
   16681 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
   16682 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
   16683 // CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
   16684 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
   16685 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
   16686 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
   16687 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
   16688 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
   16689 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   16690 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
   16691 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
   16692 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   16693 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
   16694 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
   16695 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   16696 // CHECK:   call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
   16697 // CHECK:   ret void
   16698 void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
   16699   vst1_s8_x3(a, b);
   16700 }
   16701 
   16702 // CHECK-LABEL: define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
   16703 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
   16704 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
   16705 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
   16706 // CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
   16707 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
   16708 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
   16709 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
   16710 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   16711 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
   16712 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
   16713 // CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   16714 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
   16715 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
   16716 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
   16717 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   16718 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   16719 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
   16720 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
   16721 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   16722 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   16723 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   16724 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   16725 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   16726 // CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
   16727 // CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
   16728 // CHECK:   ret void
   16729 void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
   16730   vst1_s16_x3(a, b);
   16731 }
   16732 
// CHECK-LABEL: define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
// CHECK:   ret void
void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
  vst1_s32_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
// CHECK:   ret void
void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
  vst1_s64_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1_f16_x3(half* %a, [3 x <4 x half>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK:   ret void
void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
  vst1_f16_x3(a, b);
}

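// Note that the f16 variant above is stored through the integer form of the
// intrinsic: each <4 x half> lane is bitcast to <8 x i8> and reinterpreted as
// <4 x i16>, so @llvm.aarch64.neon.st1x3.v4i16 carries the half-precision data.
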
// CHECK-LABEL: define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], float* [[TMP12]])
// CHECK:   ret void
void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
  vst1_f32_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], double* [[TMP12]])
// CHECK:   ret void
void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
  vst1_f64_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1_p8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
// CHECK:   ret void
void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
  vst1_p8_x3(a, b);
}

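// The 8-bit variants are the simplest case: the lanes are already <8 x i8>
// and the destination pointer is already i8*, so no bitcast round trip is
// needed before the @llvm.aarch64.neon.st1x3.v8i8 call.
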
// CHECK-LABEL: define void @test_vst1_p16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
// CHECK:   ret void
void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
  vst1_p16_x3(a, b);
}

// CHECK-LABEL: define void @test_vst1_p64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
// CHECK:   ret void
void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
  vst1_p64_x3(a, b);
}

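// From here on the tests switch to the q-register vst1q_*_x4 variants. The
// aggregates grow to four 128-bit lanes, so the temporaries become 16-byte
// aligned and the memcpy moves 64 bytes instead of 24. As an illustrative
// sketch (assuming vld1q_u8_x4 as the matching load; vst1 stores the
// registers contiguously rather than interleaving them the way vst4 would):
//
//   uint8_t buf[64];
//   uint8x16x4_t v = vld1q_u8_x4(buf);  // four q-registers, 64 bytes total
//   vst1q_u8_x4(buf, v);                // contiguous, non-interleaving store
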
// CHECK-LABEL: define void @test_vst1q_u8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
// CHECK:   call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
// CHECK:   ret void
void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
  vst1q_u8_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_u16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
// CHECK:   ret void
void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
  vst1q_u16_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_u32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
// CHECK:   ret void
void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
  vst1q_u32_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_u64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
// CHECK:   ret void
void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
  vst1q_u64_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
// CHECK:   call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
// CHECK:   ret void
void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
  vst1q_s8_x4(a, b);
}

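// As with the d-register forms, the signed byte test mirrors the unsigned one
// exactly: both lower to @llvm.aarch64.neon.st1x4.v16i8 with no bitcasts.
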
// CHECK-LABEL: define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
// CHECK:   ret void
void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
  vst1q_s16_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
// CHECK:   ret void
void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) {
  vst1q_s32_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
// CHECK:   ret void
void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) {
  vst1q_s64_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_f16_x4(half* %a, [4 x <8 x half>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
// CHECK:   ret void
void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) {
  vst1q_f16_x4(a, b);
}

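// The q-register f16 test again routes through the integer intrinsic,
// @llvm.aarch64.neon.st1x4.v8i16, after bitcasting each <8 x half> lane.
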
// CHECK-LABEL: define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], float* [[TMP15]])
// CHECK:   ret void
void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
  vst1q_f32_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], double* [[TMP15]])
// CHECK:   ret void
void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) {
  vst1q_f64_x4(a, b);
}

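// Unlike f16, the f32 and f64 tests above keep their floating-point element
// types all the way to the call, using the v4f32 and v2f64 forms of st1x4.
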
// CHECK-LABEL: define void @test_vst1q_p8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
// CHECK:   call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
// CHECK:   ret void
void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
  vst1q_p8_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_p16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
// CHECK:   ret void
void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
  vst1q_p16_x4(a, b);
}

// CHECK-LABEL: define void @test_vst1q_p64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
   17442 // CHECK:   ret void
   17443 void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
   17444   vst1q_p64_x4(a, b);
   17445 }
   17446 
   17447 // CHECK-LABEL: define void @test_vst1_u8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
   17448 // CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
   17449 // CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
   17450 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
   17451 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
   17452 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
   17453 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
   17454 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17455 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   17456 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
   17457 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   17458 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   17459 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
   17460 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   17461 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   17462 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
   17463 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   17464 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
   17465 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
   17466 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
   17467 // CHECK:   call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
   17468 // CHECK:   ret void
   17469 void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) {
   17470   vst1_u8_x4(a, b);
   17471 }
   17472 
   17473 // CHECK-LABEL: define void @test_vst1_u16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
   17474 // CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
   17475 // CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
   17476 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
   17477 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
   17478 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
   17479 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
   17480 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17481 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   17482 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   17483 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
   17484 // CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   17485 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
   17486 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   17487 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
   17488 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   17489 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   17490 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   17491 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
   17492 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   17493 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   17494 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
   17495 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
   17496 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
   17497 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
   17498 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   17499 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   17500 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   17501 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   17502 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
   17503 // CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
   17504 // CHECK:   ret void
   17505 void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) {
   17506   vst1_u16_x4(a, b);
   17507 }
   17508 
   17509 // CHECK-LABEL: define void @test_vst1_u32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
   17510 // CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
   17511 // CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
   17512 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
   17513 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
   17514 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
   17515 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
   17516 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17517 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   17518 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
   17519 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
   17520 // CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   17521 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
   17522 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
   17523 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
   17524 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   17525 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
   17526 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
   17527 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
   17528 // CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
   17529 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
   17530 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
   17531 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
   17532 // CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
   17533 // CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
   17534 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
   17535 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
   17536 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
   17537 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
   17538 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
   17539 // CHECK:   call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
   17540 // CHECK:   ret void
   17541 void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) {
   17542   vst1_u32_x4(a, b);
   17543 }
   17544 
   17545 // CHECK-LABEL: define void @test_vst1_u64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
   17546 // CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
   17547 // CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
   17548 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
   17549 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
   17550 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
   17551 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
   17552 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17553 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   17554 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
   17555 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
   17556 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
   17557 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
   17558 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
   17559 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
   17560 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
   17561 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
   17562 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
   17563 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
   17564 // CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
   17565 // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
   17566 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
   17567 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
   17568 // CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
   17569 // CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
   17570 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
   17571 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
   17572 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
   17573 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
   17574 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
   17575 // CHECK:   call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
   17576 // CHECK:   ret void
   17577 void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) {
   17578   vst1_u64_x4(a, b);
   17579 }
   17580 
   17581 // CHECK-LABEL: define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
   17582 // CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
   17583 // CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
   17584 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
   17585 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
   17586 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
   17587 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
   17588 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17589 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
   17590 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
   17591 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   17592 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
   17593 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
   17594 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   17595 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
   17596 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
   17597 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   17598 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
   17599 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
   17600 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
   17601 // CHECK:   call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
   17602 // CHECK:   ret void
   17603 void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) {
   17604   vst1_s8_x4(a, b);
   17605 }
   17606 
   17607 // CHECK-LABEL: define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
   17608 // CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
   17609 // CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
   17610 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
   17611 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
   17612 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
   17613 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
   17614 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17615 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   17616 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   17617 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
   17618 // CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   17619 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
   17620 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   17621 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
   17622 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   17623 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   17624 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   17625 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
   17626 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   17627 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   17628 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
   17629 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
   17630 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
   17631 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
   17632 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   17633 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   17634 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   17635 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   17636 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
   17637 // CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
   17638 // CHECK:   ret void
   17639 void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) {
   17640   vst1_s16_x4(a, b);
   17641 }
   17642 
   17643 // CHECK-LABEL: define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
   17644 // CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
   17645 // CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
   17646 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
   17647 // CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
   17648 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
   17649 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
   17650 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17651 // CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
   17652 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   17653 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
   17654 // CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
   17655 // CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
   17656 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   17657 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
   17658 // CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
   17659 // CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
   17660 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   17661 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
   17662 // CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
   17663 // CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
   17664 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
   17665 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
   17666 // CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
   17667 // CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
   17668 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
   17669 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
   17670 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
   17671 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
   17672 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
   17673 // CHECK:   call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
   17674 // CHECK:   ret void
   17675 void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) {
   17676   vst1_s32_x4(a, b);
   17677 }
   17678 
   17679 // CHECK-LABEL: define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
   17680 // CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
   17681 // CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
   17682 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
   17683 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
   17684 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
   17685 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
   17686 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17687 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   17688 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
   17689 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
   17690 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
   17691 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
   17692 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
   17693 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
   17694 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
   17695 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
   17696 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
   17697 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
   17698 // CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
   17699 // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
   17700 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
   17701 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
   17702 // CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
   17703 // CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
   17704 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
   17705 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
   17706 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
   17707 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
   17708 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
   17709 // CHECK:   call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
   17710 // CHECK:   ret void
   17711 void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) {
   17712   vst1_s64_x4(a, b);
   17713 }
   17714 
   17715 // CHECK-LABEL: define void @test_vst1_f16_x4(half* %a, [4 x <4 x half>] %b.coerce) #0 {
   17716 // CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
   17717 // CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
   17718 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
   17719 // CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
   17720 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
   17721 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
   17722 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17723 // CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
   17724 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   17725 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
   17726 // CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
   17727 // CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
   17728 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   17729 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
   17730 // CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
   17731 // CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
   17732 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   17733 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
   17734 // CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
   17735 // CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
   17736 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
   17737 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
   17738 // CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
   17739 // CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
   17740 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   17741 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   17742 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   17743 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   17744 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
   17745 // CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
   17746 // CHECK:   ret void
   17747 void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) {
   17748   vst1_f16_x4(a, b);
   17749 }
   17750 
   17751 // CHECK-LABEL: define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b.coerce) #0 {
   17752 // CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
   17753 // CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
   17754 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
   17755 // CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
   17756 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
   17757 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
   17758 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17759 // CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
   17760 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
   17761 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
   17762 // CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
   17763 // CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
   17764 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
   17765 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
   17766 // CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
   17767 // CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
   17768 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
   17769 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
   17770 // CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
   17771 // CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
   17772 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
   17773 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
   17774 // CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
   17775 // CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
   17776 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
   17777 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
   17778 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
   17779 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
   17780 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
   17781 // CHECK:   call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], float* [[TMP15]])
   17782 // CHECK:   ret void
   17783 void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
   17784   vst1_f32_x4(a, b);
   17785 }
   17786 
   17787 // CHECK-LABEL: define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b.coerce) #0 {
   17788 // CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
   17789 // CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
   17790 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
   17791 // CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
   17792 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
   17793 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
   17794 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17795 // CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
   17796 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
   17797 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
   17798 // CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
   17799 // CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
   17800 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
   17801 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
   17802 // CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
   17803 // CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
   17804 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
   17805 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
   17806 // CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
   17807 // CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
   17808 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
   17809 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
   17810 // CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
   17811 // CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
   17812 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
   17813 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
   17814 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
   17815 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
   17816 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double*
   17817 // CHECK:   call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], double* [[TMP15]])
   17818 // CHECK:   ret void
   17819 void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) {
   17820   vst1_f64_x4(a, b);
   17821 }
   17822 
   17823 // CHECK-LABEL: define void @test_vst1_p8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
   17824 // CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
   17825 // CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
   17826 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
   17827 // CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
   17828 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
   17829 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
   17830 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17831 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
   17832 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
   17833 // CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
   17834 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
   17835 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
   17836 // CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
   17837 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
   17838 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
   17839 // CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
   17840 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
   17841 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
   17842 // CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
   17843 // CHECK:   call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
   17844 // CHECK:   ret void
   17845 void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) {
   17846   vst1_p8_x4(a, b);
   17847 }
   17848 
   17849 // CHECK-LABEL: define void @test_vst1_p16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
   17850 // CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
   17851 // CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
   17852 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
   17853 // CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
   17854 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
   17855 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
   17856 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17857 // CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
   17858 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   17859 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
   17860 // CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
   17861 // CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
   17862 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   17863 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
   17864 // CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
   17865 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
   17866 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   17867 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
   17868 // CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
   17869 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
   17870 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
   17871 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
   17872 // CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
   17873 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
   17874 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
   17875 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
   17876 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
   17877 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
   17878 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
   17879 // CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
   17880 // CHECK:   ret void
   17881 void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
   17882   vst1_p16_x4(a, b);
   17883 }
   17884 
   17885 // CHECK-LABEL: define void @test_vst1_p64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
   17886 // CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
   17887 // CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
   17888 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
   17889 // CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
   17890 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
   17891 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
   17892 // CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
   17893 // CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
   17894 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
   17895 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
   17896 // CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
   17897 // CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
   17898 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
   17899 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
   17900 // CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
   17901 // CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
   17902 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
   17903 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
   17904 // CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
   17905 // CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
   17906 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
   17907 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
   17908 // CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
   17909 // CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
   17910 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
   17911 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
   17912 // CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
   17913 // CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
   17914 // CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
   17915 // CHECK:   call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
   17916 // CHECK:   ret void
   17917 void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) {
   17918   vst1_p64_x4(a, b);
   17919 }
   17920 
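// The vst1*_x4 intrinsics exercised above store four whole vectors to
// consecutive, un-interleaved memory (unlike vst4, which interleaves
// elements across the four registers). A minimal usage sketch follows; it
// is illustrative only (the helper name is invented here) and is kept
// static and uncalled so it is never emitted and the checked IR above is
// not disturbed.
static void example_vst1_u8_x4(uint8_t *dst, uint8x8_t v0, uint8x8_t v1,
                               uint8x8_t v2, uint8x8_t v3) {
  uint8x8x4_t q = {{v0, v1, v2, v3}};
  // v0 lands in dst[0..7], v1 in dst[8..15], v2 in dst[16..23],
  // v3 in dst[24..31]: one contiguous 32-byte block.
  vst1_u8_x4(dst, q);
}
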
// CHECK-LABEL: define i64 @test_vceqd_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
int64_t test_vceqd_s64(int64_t a, int64_t b) {
  return (int64_t)vceqd_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vceqd_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
uint64_t test_vceqd_u64(uint64_t a, uint64_t b) {
  return (uint64_t)vceqd_u64(a, b);
}

// CHECK-LABEL: define i64 @test_vceqzd_s64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, 0
// CHECK:   [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQZ_I]]
int64_t test_vceqzd_s64(int64_t a) {
  return (int64_t)vceqzd_s64(a);
}

// CHECK-LABEL: define i64 @test_vceqzd_u64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, 0
// CHECK:   [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQZD_I]]
uint64_t test_vceqzd_u64(uint64_t a) {
  return (uint64_t)vceqzd_u64(a);
}

// CHECK-LABEL: define i64 @test_vcged_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp sge i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
int64_t test_vcged_s64(int64_t a, int64_t b) {
  return (int64_t)vcged_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vcged_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp uge i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
uint64_t test_vcged_u64(uint64_t a, uint64_t b) {
  return (uint64_t)vcged_u64(a, b);
}

// CHECK-LABEL: define i64 @test_vcgezd_s64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = icmp sge i64 %a, 0
// CHECK:   [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCGEZ_I]]
int64_t test_vcgezd_s64(int64_t a) {
  return (int64_t)vcgezd_s64(a);
}

// CHECK-LABEL: define i64 @test_vcgtd_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp sgt i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
int64_t test_vcgtd_s64(int64_t a, int64_t b) {
  return (int64_t)vcgtd_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vcgtd_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp ugt i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
uint64_t test_vcgtd_u64(uint64_t a, uint64_t b) {
  return (uint64_t)vcgtd_u64(a, b);
}

// CHECK-LABEL: define i64 @test_vcgtzd_s64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = icmp sgt i64 %a, 0
// CHECK:   [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCGTZ_I]]
int64_t test_vcgtzd_s64(int64_t a) {
  return (int64_t)vcgtzd_s64(a);
}

// CHECK-LABEL: define i64 @test_vcled_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp sle i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
int64_t test_vcled_s64(int64_t a, int64_t b) {
  return (int64_t)vcled_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vcled_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp ule i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
uint64_t test_vcled_u64(uint64_t a, uint64_t b) {
  return (uint64_t)vcled_u64(a, b);
}

// CHECK-LABEL: define i64 @test_vclezd_s64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = icmp sle i64 %a, 0
// CHECK:   [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCLEZ_I]]
int64_t test_vclezd_s64(int64_t a) {
  return (int64_t)vclezd_s64(a);
}

// CHECK-LABEL: define i64 @test_vcltd_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp slt i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
int64_t test_vcltd_s64(int64_t a, int64_t b) {
  return (int64_t)vcltd_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vcltd_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = icmp ult i64 %a, %b
// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCEQD_I]]
uint64_t test_vcltd_u64(uint64_t a, uint64_t b) {
  return (uint64_t)vcltd_u64(a, b);
}

// CHECK-LABEL: define i64 @test_vcltzd_s64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = icmp slt i64 %a, 0
// CHECK:   [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
// CHECK:   ret i64 [[VCLTZ_I]]
int64_t test_vcltzd_s64(int64_t a) {
  return (int64_t)vcltzd_s64(a);
}

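// The scalar compares above (vceqd, vcged, vcgtd, vcled, vcltd and their
// zero-comparing forms) return an all-ones or all-zeros i64 mask rather than
// a 0/1 boolean, as the sext in each body shows. That makes them directly
// usable for branchless selection. A minimal sketch; the helper name is
// invented here, and the function is kept static and uncalled so the checked
// IR is untouched:
static int64_t example_branchless_max_s64(int64_t a, int64_t b) {
  uint64_t mask = vcgtd_s64(a, b);  // ~0 if a > b, 0 otherwise
  return (int64_t)(((uint64_t)a & mask) | ((uint64_t)b & ~mask));
}
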
// CHECK-LABEL: define i64 @test_vtstd_s64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = and i64 %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
// CHECK:   [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
// CHECK:   ret i64 [[VTSTD_I]]
int64_t test_vtstd_s64(int64_t a, int64_t b) {
  return (int64_t)vtstd_s64(a, b);
}

// CHECK-LABEL: define i64 @test_vtstd_u64(i64 %a, i64 %b) #0 {
// CHECK:   [[TMP0:%.*]] = and i64 %a, %b
// CHECK:   [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
// CHECK:   [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
// CHECK:   ret i64 [[VTSTD_I]]
uint64_t test_vtstd_u64(uint64_t a, uint64_t b) {
  return (uint64_t)vtstd_u64(a, b);
}

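// vtstd_s64/vtstd_u64 test bits rather than order: per the and/icmp-ne/sext
// sequence above, the result is all ones when a and b share any set bit and
// all zeros otherwise. Worked example: vtstd_u64(0x0f, 0x10) yields 0 (no
// common bits), while vtstd_u64(0x0f, 0x18) yields 0xffffffffffffffff
// (bit 3 is common).
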
// CHECK-LABEL: define i64 @test_vabsd_s64(i64 %a) #0 {
// CHECK:   [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 %a) #4
// CHECK:   ret i64 [[VABSD_S64_I]]
int64_t test_vabsd_s64(int64_t a) {
  return (int64_t)vabsd_s64(a);
}

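// vabsd_s64 lowers to the plain ABS instruction, which wraps rather than
// saturates: vabsd_s64(INT64_MIN) is INT64_MIN again, since the magnitude is
// not representable. The saturating counterparts tested next (vqabs*/vqneg*)
// clamp instead, e.g. vqabsb_s8(-128) == 127.
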
// CHECK-LABEL: define i8 @test_vqabsb_s8(i8 %a) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) #4
// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0
// CHECK:   ret i8 [[TMP1]]
int8_t test_vqabsb_s8(int8_t a) {
  return (int8_t)vqabsb_s8(a);
}

// CHECK-LABEL: define i16 @test_vqabsh_s16(i16 %a) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) #4
// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP1]]
int16_t test_vqabsh_s16(int16_t a) {
  return (int16_t)vqabsh_s16(a);
}

// CHECK-LABEL: define i32 @test_vqabss_s32(i32 %a) #0 {
// CHECK:   [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) #4
// CHECK:   ret i32 [[VQABSS_S32_I]]
int32_t test_vqabss_s32(int32_t a) {
  return (int32_t)vqabss_s32(a);
}

// CHECK-LABEL: define i64 @test_vqabsd_s64(i64 %a) #0 {
// CHECK:   [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 %a) #4
// CHECK:   ret i64 [[VQABSD_S64_I]]
int64_t test_vqabsd_s64(int64_t a) {
  return (int64_t)vqabsd_s64(a);
}

// CHECK-LABEL: define i64 @test_vnegd_s64(i64 %a) #0 {
// CHECK:   [[VNEGD_I:%.*]] = sub i64 0, %a
// CHECK:   ret i64 [[VNEGD_I]]
int64_t test_vnegd_s64(int64_t a) {
  return (int64_t)vnegd_s64(a);
}

// CHECK-LABEL: define i8 @test_vqnegb_s8(i8 %a) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
// CHECK:   [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) #4
// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0
// CHECK:   ret i8 [[TMP1]]
int8_t test_vqnegb_s8(int8_t a) {
  return (int8_t)vqnegb_s8(a);
}

// CHECK-LABEL: define i16 @test_vqnegh_s16(i16 %a) #0 {
// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) #4
// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP1]]
int16_t test_vqnegh_s16(int16_t a) {
  return (int16_t)vqnegh_s16(a);
}

// CHECK-LABEL: define i32 @test_vqnegs_s32(i32 %a) #0 {
// CHECK:   [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 %a) #4
// CHECK:   ret i32 [[VQNEGS_S32_I]]
int32_t test_vqnegs_s32(int32_t a) {
  return (int32_t)vqnegs_s32(a);
}

// CHECK-LABEL: define i64 @test_vqnegd_s64(i64 %a) #0 {
// CHECK:   [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 %a) #4
// CHECK:   ret i64 [[VQNEGD_S64_I]]
int64_t test_vqnegd_s64(int64_t a) {
  return (int64_t)vqnegd_s64(a);
}

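// The mixed-signedness saturating adds below map to SUQADD/USQADD: vuqadd*
// adds an unsigned operand to a signed accumulator with signed saturation,
// and vsqadd* adds a signed operand to an unsigned accumulator with unsigned
// saturation. Worked examples: vuqaddb_s8(100, 100) clamps to 127, and
// vsqaddb_u8(250, 10) clamps to 255.
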
   18145 // CHECK-LABEL: define i8 @test_vuqaddb_s8(i8 %a, i8 %b) #0 {
   18146 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
   18147 // CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
   18148 // CHECK:   [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
   18149 // CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0
   18150 // CHECK:   ret i8 [[TMP2]]
   18151 int8_t test_vuqaddb_s8(int8_t a, int8_t b) {
   18152   return (int8_t)vuqaddb_s8(a, b);
   18153 }
   18154 
   18155 // CHECK-LABEL: define i16 @test_vuqaddh_s16(i16 %a, i16 %b) #0 {
   18156 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
   18157 // CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
   18158 // CHECK:   [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
   18159 // CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0
   18160 // CHECK:   ret i16 [[TMP2]]
   18161 int16_t test_vuqaddh_s16(int16_t a, int16_t b) {
   18162   return (int16_t)vuqaddh_s16(a, b);
   18163 }
   18164 
   18165 // CHECK-LABEL: define i32 @test_vuqadds_s32(i32 %a, i32 %b) #0 {
   18166 // CHECK:   [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %a, i32 %b) #4
   18167 // CHECK:   ret i32 [[VUQADDS_S32_I]]
   18168 int32_t test_vuqadds_s32(int32_t a, int32_t b) {
   18169   return (int32_t)vuqadds_s32(a, b);
   18170 }
   18171 
   18172 // CHECK-LABEL: define i64 @test_vuqaddd_s64(i64 %a, i64 %b) #0 {
   18173 // CHECK:   [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %a, i64 %b) #4
   18174 // CHECK:   ret i64 [[VUQADDD_S64_I]]
   18175 int64_t test_vuqaddd_s64(int64_t a, int64_t b) {
   18176   return (int64_t)vuqaddd_s64(a, b);
   18177 }
   18178 
   18179 // CHECK-LABEL: define i8 @test_vsqaddb_u8(i8 %a, i8 %b) #0 {
   18180 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
   18181 // CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
   18182 // CHECK:   [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
   18183 // CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0
   18184 // CHECK:   ret i8 [[TMP2]]
   18185 uint8_t test_vsqaddb_u8(uint8_t a, uint8_t b) {
   18186   return (uint8_t)vsqaddb_u8(a, b);
   18187 }
   18188 
   18189 // CHECK-LABEL: define i16 @test_vsqaddh_u16(i16 %a, i16 %b) #0 {
   18190 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
   18191 // CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
   18192 // CHECK:   [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
   18193 // CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0
   18194 // CHECK:   ret i16 [[TMP2]]
   18195 uint16_t test_vsqaddh_u16(uint16_t a, uint16_t b) {
   18196   return (uint16_t)vsqaddh_u16(a, b);
   18197 }
   18198 
   18199 // CHECK-LABEL: define i32 @test_vsqadds_u32(i32 %a, i32 %b) #0 {
   18200 // CHECK:   [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %a, i32 %b) #4
   18201 // CHECK:   ret i32 [[VSQADDS_U32_I]]
   18202 uint32_t test_vsqadds_u32(uint32_t a, uint32_t b) {
   18203   return (uint32_t)vsqadds_u32(a, b);
   18204 }
   18205 
   18206 // CHECK-LABEL: define i64 @test_vsqaddd_u64(i64 %a, i64 %b) #0 {
   18207 // CHECK:   [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %a, i64 %b) #4
   18208 // CHECK:   ret i64 [[VSQADDD_U64_I]]
   18209 uint64_t test_vsqaddd_u64(uint64_t a, uint64_t b) {
   18210   return (uint64_t)vsqaddd_u64(a, b);
   18211 }
   18212 
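// Scalar saturating doubling multiply-accumulate: the product is formed with
// sqdmull, then combined with the accumulator via sqadd (vqdmlal) or sqsub
// (vqdmlsl). There is no 16x16-bit scalar sqdmull, so the half-word forms
// widen through a <4 x i16> multiply and extract lane 0.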
   18213 // CHECK-LABEL: define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) #0 {
   18214 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
   18215 // CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0
   18216 // CHECK:   [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
   18217 // CHECK:   [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
   18218 // CHECK:   [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0_I]]) #4
   18219 // CHECK:   ret i32 [[VQDMLXL1_I]]
18220 int32_t test_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) {
18222   return (int32_t)vqdmlalh_s16(a, b, c);
   18223 }
   18224 
   18225 // CHECK-LABEL: define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) #0 {
   18226 // CHECK:   [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4
   18227 // CHECK:   [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL_I]]) #4
   18228 // CHECK:   ret i64 [[VQDMLXL1_I]]
   18229 int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) {
   18230   return (int64_t)vqdmlals_s32(a, b, c);
   18231 }
   18232 
   18233 // CHECK-LABEL: define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) #0 {
   18234 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
   18235 // CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0
   18236 // CHECK:   [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
   18237 // CHECK:   [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
   18238 // CHECK:   [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0_I]]) #4
   18239 // CHECK:   ret i32 [[VQDMLXL1_I]]
18240 int32_t test_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) {
18242   return (int32_t)vqdmlslh_s16(a, b, c);
   18243 }
   18244 
   18245 // CHECK-LABEL: define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) #0 {
   18246 // CHECK:   [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4
   18247 // CHECK:   [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL_I]]) #4
   18248 // CHECK:   ret i64 [[VQDMLXL1_I]]
   18249 int64_t test_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) {
   18250   return (int64_t)vqdmlsls_s32(a, b, c);
   18251 }
   18252 
   18253 // CHECK-LABEL: define i32 @test_vqdmullh_s16(i16 %a, i16 %b) #0 {
   18254 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
   18255 // CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
   18256 // CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
   18257 // CHECK:   [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
   18258 // CHECK:   ret i32 [[TMP2]]
   18259 int32_t test_vqdmullh_s16(int16_t a, int16_t b) {
   18260   return (int32_t)vqdmullh_s16(a, b);
   18261 }
   18262 
   18263 // CHECK-LABEL: define i64 @test_vqdmulls_s32(i32 %a, i32 %b) #0 {
   18264 // CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b) #4
   18265 // CHECK:   ret i64 [[VQDMULLS_S32_I]]
   18266 int64_t test_vqdmulls_s32(int32_t a, int32_t b) {
   18267   return (int64_t)vqdmulls_s32(a, b);
   18268 }
   18269 
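// Saturating narrowing moves: sqxtun narrows signed to unsigned, while sqxtn
// and uqxtn preserve signedness. Only the 64-to-32-bit forms have true
// scalar intrinsics; the narrower ones go through vector lane 0.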
   18270 // CHECK-LABEL: define i8 @test_vqmovunh_s16(i16 %a) #0 {
   18271 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
   18272 // CHECK:   [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) #4
   18273 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0
   18274 // CHECK:   ret i8 [[TMP1]]
   18275 int8_t test_vqmovunh_s16(int16_t a) {
   18276   return (int8_t)vqmovunh_s16(a);
   18277 }
   18278 
   18279 // CHECK-LABEL: define i16 @test_vqmovuns_s32(i32 %a) #0 {
   18280 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
   18281 // CHECK:   [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) #4
   18282 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0
   18283 // CHECK:   ret i16 [[TMP1]]
   18284 int16_t test_vqmovuns_s32(int32_t a) {
   18285   return (int16_t)vqmovuns_s32(a);
   18286 }
   18287 
   18288 // CHECK-LABEL: define i32 @test_vqmovund_s64(i64 %a) #0 {
   18289 // CHECK:   [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %a) #4
   18290 // CHECK:   ret i32 [[VQMOVUND_S64_I]]
   18291 int32_t test_vqmovund_s64(int64_t a) {
   18292   return (int32_t)vqmovund_s64(a);
   18293 }
   18294 
   18295 // CHECK-LABEL: define i8 @test_vqmovnh_s16(i16 %a) #0 {
   18296 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
   18297 // CHECK:   [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) #4
   18298 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0
   18299 // CHECK:   ret i8 [[TMP1]]
   18300 int8_t test_vqmovnh_s16(int16_t a) {
   18301   return (int8_t)vqmovnh_s16(a);
   18302 }
   18303 
   18304 // CHECK-LABEL: define i16 @test_vqmovns_s32(i32 %a) #0 {
   18305 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
   18306 // CHECK:   [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) #4
   18307 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0
   18308 // CHECK:   ret i16 [[TMP1]]
   18309 int16_t test_vqmovns_s32(int32_t a) {
   18310   return (int16_t)vqmovns_s32(a);
   18311 }
   18312 
   18313 // CHECK-LABEL: define i32 @test_vqmovnd_s64(i64 %a) #0 {
   18314 // CHECK:   [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %a) #4
   18315 // CHECK:   ret i32 [[VQMOVND_S64_I]]
   18316 int32_t test_vqmovnd_s64(int64_t a) {
   18317   return (int32_t)vqmovnd_s64(a);
   18318 }
   18319 
   18320 // CHECK-LABEL: define i8 @test_vqmovnh_u16(i16 %a) #0 {
   18321 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
   18322 // CHECK:   [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) #4
   18323 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0
   18324 // CHECK:   ret i8 [[TMP1]]
18325 uint8_t test_vqmovnh_u16(uint16_t a) {
18326   return (uint8_t)vqmovnh_u16(a);
   18327 }
   18328 
   18329 // CHECK-LABEL: define i16 @test_vqmovns_u32(i32 %a) #0 {
   18330 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
   18331 // CHECK:   [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) #4
   18332 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0
   18333 // CHECK:   ret i16 [[TMP1]]
18334 uint16_t test_vqmovns_u32(uint32_t a) {
18335   return (uint16_t)vqmovns_u32(a);
   18336 }
   18337 
   18338 // CHECK-LABEL: define i32 @test_vqmovnd_u64(i64 %a) #0 {
   18339 // CHECK:   [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %a) #4
   18340 // CHECK:   ret i32 [[VQMOVND_U64_I]]
18341 uint32_t test_vqmovnd_u64(uint64_t a) {
18342   return (uint32_t)vqmovnd_u64(a);
   18343 }
   18344 
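// Scalar floating-point comparisons need no target intrinsic: they lower to
// an ordered fcmp whose i1 result is sign-extended, yielding an all-ones
// mask when the comparison holds and zero otherwise.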
   18345 // CHECK-LABEL: define i32 @test_vceqs_f32(float %a, float %b) #0 {
   18346 // CHECK:   [[TMP0:%.*]] = fcmp oeq float %a, %b
   18347 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
   18348 // CHECK:   ret i32 [[VCMPD_I]]
   18349 uint32_t test_vceqs_f32(float32_t a, float32_t b) {
   18350   return (uint32_t)vceqs_f32(a, b);
   18351 }
   18352 
   18353 // CHECK-LABEL: define i64 @test_vceqd_f64(double %a, double %b) #0 {
   18354 // CHECK:   [[TMP0:%.*]] = fcmp oeq double %a, %b
   18355 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
   18356 // CHECK:   ret i64 [[VCMPD_I]]
   18357 uint64_t test_vceqd_f64(float64_t a, float64_t b) {
   18358   return (uint64_t)vceqd_f64(a, b);
   18359 }
   18360 
   18361 // CHECK-LABEL: define i32 @test_vceqzs_f32(float %a) #0 {
   18362 // CHECK:   [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00
   18363 // CHECK:   [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
   18364 // CHECK:   ret i32 [[VCEQZ_I]]
   18365 uint32_t test_vceqzs_f32(float32_t a) {
   18366   return (uint32_t)vceqzs_f32(a);
   18367 }
   18368 
   18369 // CHECK-LABEL: define i64 @test_vceqzd_f64(double %a) #0 {
   18370 // CHECK:   [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00
   18371 // CHECK:   [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
   18372 // CHECK:   ret i64 [[VCEQZ_I]]
   18373 uint64_t test_vceqzd_f64(float64_t a) {
   18374   return (uint64_t)vceqzd_f64(a);
   18375 }
   18376 
   18377 // CHECK-LABEL: define i32 @test_vcges_f32(float %a, float %b) #0 {
   18378 // CHECK:   [[TMP0:%.*]] = fcmp oge float %a, %b
   18379 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
   18380 // CHECK:   ret i32 [[VCMPD_I]]
   18381 uint32_t test_vcges_f32(float32_t a, float32_t b) {
   18382   return (uint32_t)vcges_f32(a, b);
   18383 }
   18384 
   18385 // CHECK-LABEL: define i64 @test_vcged_f64(double %a, double %b) #0 {
   18386 // CHECK:   [[TMP0:%.*]] = fcmp oge double %a, %b
   18387 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
   18388 // CHECK:   ret i64 [[VCMPD_I]]
   18389 uint64_t test_vcged_f64(float64_t a, float64_t b) {
   18390   return (uint64_t)vcged_f64(a, b);
   18391 }
   18392 
   18393 // CHECK-LABEL: define i32 @test_vcgezs_f32(float %a) #0 {
   18394 // CHECK:   [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00
   18395 // CHECK:   [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
   18396 // CHECK:   ret i32 [[VCGEZ_I]]
   18397 uint32_t test_vcgezs_f32(float32_t a) {
   18398   return (uint32_t)vcgezs_f32(a);
   18399 }
   18400 
   18401 // CHECK-LABEL: define i64 @test_vcgezd_f64(double %a) #0 {
   18402 // CHECK:   [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00
   18403 // CHECK:   [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
   18404 // CHECK:   ret i64 [[VCGEZ_I]]
   18405 uint64_t test_vcgezd_f64(float64_t a) {
   18406   return (uint64_t)vcgezd_f64(a);
   18407 }
   18408 
   18409 // CHECK-LABEL: define i32 @test_vcgts_f32(float %a, float %b) #0 {
   18410 // CHECK:   [[TMP0:%.*]] = fcmp ogt float %a, %b
   18411 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
   18412 // CHECK:   ret i32 [[VCMPD_I]]
   18413 uint32_t test_vcgts_f32(float32_t a, float32_t b) {
   18414   return (uint32_t)vcgts_f32(a, b);
   18415 }
   18416 
   18417 // CHECK-LABEL: define i64 @test_vcgtd_f64(double %a, double %b) #0 {
   18418 // CHECK:   [[TMP0:%.*]] = fcmp ogt double %a, %b
   18419 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
   18420 // CHECK:   ret i64 [[VCMPD_I]]
   18421 uint64_t test_vcgtd_f64(float64_t a, float64_t b) {
   18422   return (uint64_t)vcgtd_f64(a, b);
   18423 }
   18424 
   18425 // CHECK-LABEL: define i32 @test_vcgtzs_f32(float %a) #0 {
   18426 // CHECK:   [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00
   18427 // CHECK:   [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
   18428 // CHECK:   ret i32 [[VCGTZ_I]]
   18429 uint32_t test_vcgtzs_f32(float32_t a) {
   18430   return (uint32_t)vcgtzs_f32(a);
   18431 }
   18432 
   18433 // CHECK-LABEL: define i64 @test_vcgtzd_f64(double %a) #0 {
   18434 // CHECK:   [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00
   18435 // CHECK:   [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
   18436 // CHECK:   ret i64 [[VCGTZ_I]]
   18437 uint64_t test_vcgtzd_f64(float64_t a) {
   18438   return (uint64_t)vcgtzd_f64(a);
   18439 }
   18440 
   18441 // CHECK-LABEL: define i32 @test_vcles_f32(float %a, float %b) #0 {
   18442 // CHECK:   [[TMP0:%.*]] = fcmp ole float %a, %b
   18443 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
   18444 // CHECK:   ret i32 [[VCMPD_I]]
   18445 uint32_t test_vcles_f32(float32_t a, float32_t b) {
   18446   return (uint32_t)vcles_f32(a, b);
   18447 }
   18448 
   18449 // CHECK-LABEL: define i64 @test_vcled_f64(double %a, double %b) #0 {
   18450 // CHECK:   [[TMP0:%.*]] = fcmp ole double %a, %b
   18451 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
   18452 // CHECK:   ret i64 [[VCMPD_I]]
   18453 uint64_t test_vcled_f64(float64_t a, float64_t b) {
   18454   return (uint64_t)vcled_f64(a, b);
   18455 }
   18456 
   18457 // CHECK-LABEL: define i32 @test_vclezs_f32(float %a) #0 {
   18458 // CHECK:   [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00
   18459 // CHECK:   [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
   18460 // CHECK:   ret i32 [[VCLEZ_I]]
   18461 uint32_t test_vclezs_f32(float32_t a) {
   18462   return (uint32_t)vclezs_f32(a);
   18463 }
   18464 
   18465 // CHECK-LABEL: define i64 @test_vclezd_f64(double %a) #0 {
   18466 // CHECK:   [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00
   18467 // CHECK:   [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
   18468 // CHECK:   ret i64 [[VCLEZ_I]]
   18469 uint64_t test_vclezd_f64(float64_t a) {
   18470   return (uint64_t)vclezd_f64(a);
   18471 }
   18472 
   18473 // CHECK-LABEL: define i32 @test_vclts_f32(float %a, float %b) #0 {
   18474 // CHECK:   [[TMP0:%.*]] = fcmp olt float %a, %b
   18475 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
   18476 // CHECK:   ret i32 [[VCMPD_I]]
   18477 uint32_t test_vclts_f32(float32_t a, float32_t b) {
   18478   return (uint32_t)vclts_f32(a, b);
   18479 }
   18480 
   18481 // CHECK-LABEL: define i64 @test_vcltd_f64(double %a, double %b) #0 {
   18482 // CHECK:   [[TMP0:%.*]] = fcmp olt double %a, %b
   18483 // CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
   18484 // CHECK:   ret i64 [[VCMPD_I]]
   18485 uint64_t test_vcltd_f64(float64_t a, float64_t b) {
   18486   return (uint64_t)vcltd_f64(a, b);
   18487 }
   18488 
   18489 // CHECK-LABEL: define i32 @test_vcltzs_f32(float %a) #0 {
   18490 // CHECK:   [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00
   18491 // CHECK:   [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
   18492 // CHECK:   ret i32 [[VCLTZ_I]]
   18493 uint32_t test_vcltzs_f32(float32_t a) {
   18494   return (uint32_t)vcltzs_f32(a);
   18495 }
   18496 
   18497 // CHECK-LABEL: define i64 @test_vcltzd_f64(double %a) #0 {
   18498 // CHECK:   [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00
   18499 // CHECK:   [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
   18500 // CHECK:   ret i64 [[VCLTZ_I]]
   18501 uint64_t test_vcltzd_f64(float64_t a) {
   18502   return (uint64_t)vcltzd_f64(a);
   18503 }
   18504 
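// Absolute comparisons test |a| against |b| through the facge/facgt
// intrinsics; the less-than forms (vcale/vcalt) simply swap the operands
// instead of using a separate intrinsic.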
   18505 // CHECK-LABEL: define i32 @test_vcages_f32(float %a, float %b) #0 {
   18506 // CHECK:   [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %a, float %b) #4
   18507 // CHECK:   ret i32 [[VCAGES_F32_I]]
   18508 uint32_t test_vcages_f32(float32_t a, float32_t b) {
   18509   return (uint32_t)vcages_f32(a, b);
   18510 }
   18511 
   18512 // CHECK-LABEL: define i64 @test_vcaged_f64(double %a, double %b) #0 {
   18513 // CHECK:   [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %a, double %b) #4
   18514 // CHECK:   ret i64 [[VCAGED_F64_I]]
   18515 uint64_t test_vcaged_f64(float64_t a, float64_t b) {
   18516   return (uint64_t)vcaged_f64(a, b);
   18517 }
   18518 
   18519 // CHECK-LABEL: define i32 @test_vcagts_f32(float %a, float %b) #0 {
   18520 // CHECK:   [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %a, float %b) #4
   18521 // CHECK:   ret i32 [[VCAGTS_F32_I]]
   18522 uint32_t test_vcagts_f32(float32_t a, float32_t b) {
   18523   return (uint32_t)vcagts_f32(a, b);
   18524 }
   18525 
   18526 // CHECK-LABEL: define i64 @test_vcagtd_f64(double %a, double %b) #0 {
   18527 // CHECK:   [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %a, double %b) #4
   18528 // CHECK:   ret i64 [[VCAGTD_F64_I]]
   18529 uint64_t test_vcagtd_f64(float64_t a, float64_t b) {
   18530   return (uint64_t)vcagtd_f64(a, b);
   18531 }
   18532 
   18533 // CHECK-LABEL: define i32 @test_vcales_f32(float %a, float %b) #0 {
   18534 // CHECK:   [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %b, float %a) #4
   18535 // CHECK:   ret i32 [[VCALES_F32_I]]
   18536 uint32_t test_vcales_f32(float32_t a, float32_t b) {
   18537   return (uint32_t)vcales_f32(a, b);
   18538 }
   18539 
   18540 // CHECK-LABEL: define i64 @test_vcaled_f64(double %a, double %b) #0 {
   18541 // CHECK:   [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %b, double %a) #4
   18542 // CHECK:   ret i64 [[VCALED_F64_I]]
   18543 uint64_t test_vcaled_f64(float64_t a, float64_t b) {
   18544   return (uint64_t)vcaled_f64(a, b);
   18545 }
   18546 
   18547 // CHECK-LABEL: define i32 @test_vcalts_f32(float %a, float %b) #0 {
   18548 // CHECK:   [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %b, float %a) #4
   18549 // CHECK:   ret i32 [[VCALTS_F32_I]]
   18550 uint32_t test_vcalts_f32(float32_t a, float32_t b) {
   18551   return (uint32_t)vcalts_f32(a, b);
   18552 }
   18553 
   18554 // CHECK-LABEL: define i64 @test_vcaltd_f64(double %a, double %b) #0 {
   18555 // CHECK:   [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %b, double %a) #4
   18556 // CHECK:   ret i64 [[VCALTD_F64_I]]
   18557 uint64_t test_vcaltd_f64(float64_t a, float64_t b) {
   18558   return (uint64_t)vcaltd_f64(a, b);
   18559 }
   18560 
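// Scalar shifts by an immediate lower to plain IR shift instructions. A
// shift count equal to the bit width is legal for these intrinsics: the
// unsigned right shift by 64 below folds to 0, and the matching accumulate
// form further down reduces to returning the accumulator unchanged.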
   18561 // CHECK-LABEL: define i64 @test_vshrd_n_s64(i64 %a) #0 {
   18562 // CHECK:   [[SHRD_N:%.*]] = ashr i64 %a, 1
   18563 // CHECK:   ret i64 [[SHRD_N]]
   18564 int64_t test_vshrd_n_s64(int64_t a) {
   18565   return (int64_t)vshrd_n_s64(a, 1);
   18566 }
   18567 
   18568 // CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 {
   18569 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18570 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18571 // CHECK:   [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
   18572 // CHECK:   ret <1 x i64> [[VSHR_N]]
   18573 int64x1_t test_vshr_n_s64(int64x1_t a) {
   18574   return vshr_n_s64(a, 1);
   18575 }
   18576 
   18577 // CHECK-LABEL: define i64 @test_vshrd_n_u64(i64 %a) #0 {
   18578 // CHECK:   ret i64 0
18579 uint64_t test_vshrd_n_u64(uint64_t a) {
18581   return (uint64_t)vshrd_n_u64(a, 64);
   18582 }
   18583 
   18584 // CHECK-LABEL: define i64 @test_vshrd_n_u64_2() #0 {
   18585 // CHECK:   ret i64 0
18586 uint64_t test_vshrd_n_u64_2() {
18588   uint64_t a = UINT64_C(0xf000000000000000);
   18589   return vshrd_n_u64(a, 64);
   18590 }
   18591 
   18592 // CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 {
   18593 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18594 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18595 // CHECK:   [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
   18596 // CHECK:   ret <1 x i64> [[VSHR_N]]
   18597 uint64x1_t test_vshr_n_u64(uint64x1_t a) {
   18598   return vshr_n_u64(a, 1);
   18599 }
   18600 
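// Rounding right shifts have no direct IR equivalent, so they are emitted as
// srshl/urshl (rounding shift left) with a negated shift count.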
   18601 // CHECK-LABEL: define i64 @test_vrshrd_n_s64(i64 %a) #0 {
   18602 // CHECK:   [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 -63)
   18603 // CHECK:   ret i64 [[VRSHR_N]]
   18604 int64_t test_vrshrd_n_s64(int64_t a) {
   18605   return (int64_t)vrshrd_n_s64(a, 63);
   18606 }
   18607 
   18608 // CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
   18609 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18610 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18611 // CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
   18612 // CHECK:   ret <1 x i64> [[VRSHR_N1]]
   18613 int64x1_t test_vrshr_n_s64(int64x1_t a) {
   18614   return vrshr_n_s64(a, 1);
   18615 }
   18616 
   18617 // CHECK-LABEL: define i64 @test_vrshrd_n_u64(i64 %a) #0 {
   18618 // CHECK:   [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 -63)
   18619 // CHECK:   ret i64 [[VRSHR_N]]
   18620 uint64_t test_vrshrd_n_u64(uint64_t a) {
   18621   return (uint64_t)vrshrd_n_u64(a, 63);
   18622 }
   18623 
   18624 // CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
   18625 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18626 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18627 // CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
   18628 // CHECK:   ret <1 x i64> [[VRSHR_N1]]
   18629 uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
   18630   return vrshr_n_u64(a, 1);
   18631 }
   18632 
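// Shift-right-and-accumulate folds to an ashr/lshr followed by an add; the
// non-rounding forms need no intrinsic call at all.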
   18633 // CHECK-LABEL: define i64 @test_vsrad_n_s64(i64 %a, i64 %b) #0 {
   18634 // CHECK:   [[SHRD_N:%.*]] = ashr i64 %b, 63
   18635 // CHECK:   [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
   18636 // CHECK:   ret i64 [[TMP0]]
   18637 int64_t test_vsrad_n_s64(int64_t a, int64_t b) {
   18638   return (int64_t)vsrad_n_s64(a, b, 63);
   18639 }
   18640 
   18641 // CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
   18642 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18643 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   18644 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18645 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   18646 // CHECK:   [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
   18647 // CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
   18648 // CHECK:   ret <1 x i64> [[TMP4]]
   18649 int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
   18650   return vsra_n_s64(a, b, 1);
   18651 }
   18652 
   18653 // CHECK-LABEL: define i64 @test_vsrad_n_u64(i64 %a, i64 %b) #0 {
   18654 // CHECK:   [[SHRD_N:%.*]] = lshr i64 %b, 63
   18655 // CHECK:   [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
   18656 // CHECK:   ret i64 [[TMP0]]
   18657 uint64_t test_vsrad_n_u64(uint64_t a, uint64_t b) {
   18658   return (uint64_t)vsrad_n_u64(a, b, 63);
   18659 }
   18660 
   18661 // CHECK-LABEL: define i64 @test_vsrad_n_u64_2(i64 %a, i64 %b) #0 {
   18662 // CHECK:   ret i64 %a
18663 uint64_t test_vsrad_n_u64_2(uint64_t a, uint64_t b) {
18665   return (uint64_t)vsrad_n_u64(a, b, 64);
   18666 }
   18667 
   18668 // CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
   18669 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18670 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   18671 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18672 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   18673 // CHECK:   [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
   18674 // CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
   18675 // CHECK:   ret <1 x i64> [[TMP4]]
   18676 uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
   18677   return vsra_n_u64(a, b, 1);
   18678 }
   18679 
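// The rounding accumulate variants combine the srshl/urshl-by-negative-count
// pattern above with an ordinary add.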
   18680 // CHECK-LABEL: define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) #0 {
   18681 // CHECK:   [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %b, i64 -63)
   18682 // CHECK:   [[TMP1:%.*]] = add i64 %a, [[TMP0]]
   18683 // CHECK:   ret i64 [[TMP1]]
   18684 int64_t test_vrsrad_n_s64(int64_t a, int64_t b) {
   18685   return (int64_t)vrsrad_n_s64(a, b, 63);
   18686 }
   18687 
   18688 // CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
   18689 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18690 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   18691 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   18692 // CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
   18693 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18694 // CHECK:   [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
   18695 // CHECK:   ret <1 x i64> [[TMP3]]
   18696 int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
   18697   return vrsra_n_s64(a, b, 1);
   18698 }
   18699 
   18700 // CHECK-LABEL: define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) #0 {
   18701 // CHECK:   [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %b, i64 -63)
   18702 // CHECK:   [[TMP1:%.*]] = add i64 %a, [[TMP0]]
   18703 // CHECK:   ret i64 [[TMP1]]
   18704 uint64_t test_vrsrad_n_u64(uint64_t a, uint64_t b) {
   18705   return (uint64_t)vrsrad_n_u64(a, b, 63);
   18706 }
   18707 
   18708 // CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
   18709 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18710 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   18711 // CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   18712 // CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
   18713 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18714 // CHECK:   [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
   18715 // CHECK:   ret <1 x i64> [[TMP3]]
   18716 uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
   18717   return vrsra_n_u64(a, b, 1);
   18718 }
   18719 
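// Left shifts by an immediate are plain shl instructions.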
   18720 // CHECK-LABEL: define i64 @test_vshld_n_s64(i64 %a) #0 {
   18721 // CHECK:   [[SHLD_N:%.*]] = shl i64 %a, 1
   18722 // CHECK:   ret i64 [[SHLD_N]]
   18723 int64_t test_vshld_n_s64(int64_t a) {
   18724   return (int64_t)vshld_n_s64(a, 1);
18725 }

   18726 // CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 {
   18727 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18728 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18729 // CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
   18730 // CHECK:   ret <1 x i64> [[VSHL_N]]
   18731 int64x1_t test_vshl_n_s64(int64x1_t a) {
   18732   return vshl_n_s64(a, 1);
   18733 }
   18734 
   18735 // CHECK-LABEL: define i64 @test_vshld_n_u64(i64 %a) #0 {
   18736 // CHECK:   [[SHLD_N:%.*]] = shl i64 %a, 63
   18737 // CHECK:   ret i64 [[SHLD_N]]
   18738 uint64_t test_vshld_n_u64(uint64_t a) {
   18739   return (uint64_t)vshld_n_u64(a, 63);
   18740 }
   18741 
   18742 // CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 {
   18743 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18744 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18745 // CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
   18746 // CHECK:   ret <1 x i64> [[VSHL_N]]
   18747 uint64x1_t test_vshl_n_u64(uint64x1_t a) {
   18748   return vshl_n_u64(a, 1);
   18749 }
   18750 
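// Saturating left shifts use the sqshl/uqshl intrinsics. In the 8- and
// 16-bit scalar forms the shift count is materialized only in lane 0 of the
// second vector operand (the other lanes are undef), because only lane 0 of
// the result is extracted.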
   18751 // CHECK-LABEL: define i8 @test_vqshlb_n_s8(i8 %a) #0 {
   18752 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
   18753 // CHECK:   [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
   18754 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
   18755 // CHECK:   ret i8 [[TMP1]]
   18756 int8_t test_vqshlb_n_s8(int8_t a) {
   18757   return (int8_t)vqshlb_n_s8(a, 7);
   18758 }
   18759 
   18760 // CHECK-LABEL: define i16 @test_vqshlh_n_s16(i16 %a) #0 {
   18761 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
   18762 // CHECK:   [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
   18763 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
   18764 // CHECK:   ret i16 [[TMP1]]
   18765 int16_t test_vqshlh_n_s16(int16_t a) {
   18766   return (int16_t)vqshlh_n_s16(a, 15);
   18767 }
   18768 
   18769 // CHECK-LABEL: define i32 @test_vqshls_n_s32(i32 %a) #0 {
   18770 // CHECK:   [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 31)
   18771 // CHECK:   ret i32 [[VQSHLS_N_S32]]
   18772 int32_t test_vqshls_n_s32(int32_t a) {
   18773   return (int32_t)vqshls_n_s32(a, 31);
   18774 }
   18775 
   18776 // CHECK-LABEL: define i64 @test_vqshld_n_s64(i64 %a) #0 {
   18777 // CHECK:   [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 63)
   18778 // CHECK:   ret i64 [[VQSHL_N]]
   18779 int64_t test_vqshld_n_s64(int64_t a) {
   18780   return (int64_t)vqshld_n_s64(a, 63);
   18781 }
   18782 
   18783 // CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
   18784 // CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
   18785 // CHECK:   ret <8 x i8> [[VQSHL_N]]
   18786 int8x8_t test_vqshl_n_s8(int8x8_t a) {
   18787   return vqshl_n_s8(a, 0);
   18788 }
   18789 
   18790 // CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
   18791 // CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
   18792 // CHECK:   ret <16 x i8> [[VQSHL_N]]
   18793 int8x16_t test_vqshlq_n_s8(int8x16_t a) {
   18794   return vqshlq_n_s8(a, 0);
   18795 }
   18796 
   18797 // CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
   18798 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   18799 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   18800 // CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
   18801 // CHECK:   ret <4 x i16> [[VQSHL_N1]]
   18802 int16x4_t test_vqshl_n_s16(int16x4_t a) {
   18803   return vqshl_n_s16(a, 0);
   18804 }
   18805 
   18806 // CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
   18807 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   18808 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   18809 // CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
   18810 // CHECK:   ret <8 x i16> [[VQSHL_N1]]
   18811 int16x8_t test_vqshlq_n_s16(int16x8_t a) {
   18812   return vqshlq_n_s16(a, 0);
   18813 }
   18814 
   18815 // CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
   18816 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   18817 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   18818 // CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
   18819 // CHECK:   ret <2 x i32> [[VQSHL_N1]]
   18820 int32x2_t test_vqshl_n_s32(int32x2_t a) {
   18821   return vqshl_n_s32(a, 0);
   18822 }
   18823 
   18824 // CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
   18825 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   18826 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   18827 // CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
   18828 // CHECK:   ret <4 x i32> [[VQSHL_N1]]
   18829 int32x4_t test_vqshlq_n_s32(int32x4_t a) {
   18830   return vqshlq_n_s32(a, 0);
   18831 }
   18832 
   18833 // CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
   18834 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   18835 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   18836 // CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
   18837 // CHECK:   ret <2 x i64> [[VQSHL_N1]]
   18838 int64x2_t test_vqshlq_n_s64(int64x2_t a) {
   18839   return vqshlq_n_s64(a, 0);
   18840 }
   18841 
   18842 // CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
   18843 // CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
   18844 // CHECK:   ret <8 x i8> [[VQSHL_N]]
   18845 uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
   18846   return vqshl_n_u8(a, 0);
   18847 }
   18848 
   18849 // CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
   18850 // CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
   18851 // CHECK:   ret <16 x i8> [[VQSHL_N]]
   18852 uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
   18853   return vqshlq_n_u8(a, 0);
   18854 }
   18855 
   18856 // CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
   18857 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
   18858 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
   18859 // CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
   18860 // CHECK:   ret <4 x i16> [[VQSHL_N1]]
   18861 uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
   18862   return vqshl_n_u16(a, 0);
   18863 }
   18864 
   18865 // CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
   18866 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   18867 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
   18868 // CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
   18869 // CHECK:   ret <8 x i16> [[VQSHL_N1]]
   18870 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
   18871   return vqshlq_n_u16(a, 0);
   18872 }
   18873 
   18874 // CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
   18875 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
   18876 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
   18877 // CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
   18878 // CHECK:   ret <2 x i32> [[VQSHL_N1]]
   18879 uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
   18880   return vqshl_n_u32(a, 0);
   18881 }
   18882 
   18883 // CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
   18884 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   18885 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
   18886 // CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
   18887 // CHECK:   ret <4 x i32> [[VQSHL_N1]]
   18888 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
   18889   return vqshlq_n_u32(a, 0);
   18890 }
   18891 
   18892 // CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
   18893 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   18894 // CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
   18895 // CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
   18896 // CHECK:   ret <2 x i64> [[VQSHL_N1]]
   18897 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
   18898   return vqshlq_n_u64(a, 0);
   18899 }
   18900 
   18901 // CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
   18902 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18903 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18904 // CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
   18905 // CHECK:   ret <1 x i64> [[VQSHL_N1]]
   18906 int64x1_t test_vqshl_n_s64(int64x1_t a) {
   18907   return vqshl_n_s64(a, 1);
   18908 }
   18909 
   18910 // CHECK-LABEL: define i8 @test_vqshlb_n_u8(i8 %a) #0 {
   18911 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
   18912 // CHECK:   [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
   18913 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
   18914 // CHECK:   ret i8 [[TMP1]]
   18915 uint8_t test_vqshlb_n_u8(uint8_t a) {
   18916   return (uint8_t)vqshlb_n_u8(a, 7);
   18917 }
   18918 
   18919 // CHECK-LABEL: define i16 @test_vqshlh_n_u16(i16 %a) #0 {
   18920 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
   18921 // CHECK:   [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
   18922 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
   18923 // CHECK:   ret i16 [[TMP1]]
   18924 uint16_t test_vqshlh_n_u16(uint16_t a) {
   18925   return (uint16_t)vqshlh_n_u16(a, 15);
   18926 }
   18927 
   18928 // CHECK-LABEL: define i32 @test_vqshls_n_u32(i32 %a) #0 {
   18929 // CHECK:   [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 31)
   18930 // CHECK:   ret i32 [[VQSHLS_N_U32]]
   18931 uint32_t test_vqshls_n_u32(uint32_t a) {
   18932   return (uint32_t)vqshls_n_u32(a, 31);
   18933 }
   18934 
   18935 // CHECK-LABEL: define i64 @test_vqshld_n_u64(i64 %a) #0 {
   18936 // CHECK:   [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 63)
   18937 // CHECK:   ret i64 [[VQSHL_N]]
   18938 uint64_t test_vqshld_n_u64(uint64_t a) {
   18939   return (uint64_t)vqshld_n_u64(a, 63);
   18940 }
   18941 
   18942 // CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
   18943 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18944 // CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18945 // CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
   18946 // CHECK:   ret <1 x i64> [[VQSHL_N1]]
   18947 uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
   18948   return vqshl_n_u64(a, 1);
   18949 }
   18950 
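// sqshlu is the signed-input, unsigned-result saturating left shift; the
// lane-0 emulation for the narrow scalar types mirrors sqshl above.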
   18951 // CHECK-LABEL: define i8 @test_vqshlub_n_s8(i8 %a) #0 {
   18952 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
   18953 // CHECK:   [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
   18954 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
   18955 // CHECK:   ret i8 [[TMP1]]
   18956 int8_t test_vqshlub_n_s8(int8_t a) {
   18957   return (int8_t)vqshlub_n_s8(a, 7);
   18958 }
   18959 
   18960 // CHECK-LABEL: define i16 @test_vqshluh_n_s16(i16 %a) #0 {
   18961 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
   18962 // CHECK:   [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
   18963 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
   18964 // CHECK:   ret i16 [[TMP1]]
   18965 int16_t test_vqshluh_n_s16(int16_t a) {
   18966   return (int16_t)vqshluh_n_s16(a, 15);
   18967 }
   18968 
   18969 // CHECK-LABEL: define i32 @test_vqshlus_n_s32(i32 %a) #0 {
   18970 // CHECK:   [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 %a, i32 31)
   18971 // CHECK:   ret i32 [[VQSHLUS_N_S32]]
   18972 int32_t test_vqshlus_n_s32(int32_t a) {
   18973   return (int32_t)vqshlus_n_s32(a, 31);
   18974 }
   18975 
   18976 // CHECK-LABEL: define i64 @test_vqshlud_n_s64(i64 %a) #0 {
   18977 // CHECK:   [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 %a, i64 63)
   18978 // CHECK:   ret i64 [[VQSHLU_N]]
   18979 int64_t test_vqshlud_n_s64(int64_t a) {
   18980   return (int64_t)vqshlud_n_s64(a, 63);
   18981 }
   18982 
   18983 // CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
   18984 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   18985 // CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   18986 // CHECK:   [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
   18987 // CHECK:   ret <1 x i64> [[VQSHLU_N1]]
   18988 uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
   18989   return vqshlu_n_s64(a, 1);
   18990 }
   18991 
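// Shift-right insert (sri) and shift-left insert (sli) exist only as vector
// intrinsics, so the 64-bit scalar forms bitcast i64 to <1 x i64>, call the
// v1i64 intrinsic, and bitcast the result back.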
   18992 // CHECK-LABEL: define i64 @test_vsrid_n_s64(i64 %a, i64 %b) #0 {
   18993 // CHECK:   [[VSRID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
   18994 // CHECK:   [[VSRID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
   18995 // CHECK:   [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
   18996 // CHECK:   [[VSRID_N_S643:%.*]] = bitcast <1 x i64> [[VSRID_N_S642]] to i64
   18997 // CHECK:   ret i64 [[VSRID_N_S643]]
   18998 int64_t test_vsrid_n_s64(int64_t a, int64_t b) {
   18999   return (int64_t)vsrid_n_s64(a, b, 63);
   19000 }
   19001 
   19002 // CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
   19003 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   19004 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   19005 // CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   19006 // CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   19007 // CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
   19008 // CHECK:   ret <1 x i64> [[VSRI_N2]]
   19009 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
   19010   return vsri_n_s64(a, b, 1);
   19011 }
   19012 
   19013 // CHECK-LABEL: define i64 @test_vsrid_n_u64(i64 %a, i64 %b) #0 {
   19014 // CHECK:   [[VSRID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
   19015 // CHECK:   [[VSRID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
   19016 // CHECK:   [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63)
   19017 // CHECK:   [[VSRID_N_U643:%.*]] = bitcast <1 x i64> [[VSRID_N_U642]] to i64
   19018 // CHECK:   ret i64 [[VSRID_N_U643]]
   19019 uint64_t test_vsrid_n_u64(uint64_t a, uint64_t b) {
   19020   return (uint64_t)vsrid_n_u64(a, b, 63);
   19021 }
   19022 
   19023 // CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
   19024 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   19025 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   19026 // CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   19027 // CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   19028 // CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
   19029 // CHECK:   ret <1 x i64> [[VSRI_N2]]
   19030 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
   19031   return vsri_n_u64(a, b, 1);
   19032 }
   19033 
   19034 // CHECK-LABEL: define i64 @test_vslid_n_s64(i64 %a, i64 %b) #0 {
   19035 // CHECK:   [[VSLID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
   19036 // CHECK:   [[VSLID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
   19037 // CHECK:   [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S641]], i32 63)
   19038 // CHECK:   [[VSLID_N_S643:%.*]] = bitcast <1 x i64> [[VSLID_N_S642]] to i64
   19039 // CHECK:   ret i64 [[VSLID_N_S643]]
   19040 int64_t test_vslid_n_s64(int64_t a, int64_t b) {
   19041   return (int64_t)vslid_n_s64(a, b, 63);
   19042 }
   19043 
   19044 // CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
   19045 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   19046 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   19047 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   19048 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   19049 // CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
   19050 // CHECK:   ret <1 x i64> [[VSLI_N2]]
   19051 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
   19052   return vsli_n_s64(a, b, 1);
   19053 }
   19054 
   19055 // CHECK-LABEL: define i64 @test_vslid_n_u64(i64 %a, i64 %b) #0 {
   19056 // CHECK:   [[VSLID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
   19057 // CHECK:   [[VSLID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
   19058 // CHECK:   [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63)
   19059 // CHECK:   [[VSLID_N_U643:%.*]] = bitcast <1 x i64> [[VSLID_N_U642]] to i64
   19060 // CHECK:   ret i64 [[VSLID_N_U643]]
   19061 uint64_t test_vslid_n_u64(uint64_t a, uint64_t b) {
   19062   return (uint64_t)vslid_n_u64(a, b, 63);
   19063 }
   19064 
   19065 // CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
   19066 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
   19067 // CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
   19068 // CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
   19069 // CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
   19070 // CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
   19071 // CHECK:   ret <1 x i64> [[VSLI_N2]]
   19072 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
   19073   return vsli_n_u64(a, b, 1);
   19074 }
   19075 
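// Saturating shift-right-narrow family: sqshrn/uqshrn, their rounding
// variants sqrshrn/uqrshrn, and the signed-to-unsigned forms sqshrun and
// sqrshrun. The shift count may be as large as the width of the narrowed
// result, which is what the maximal counts used below exercise.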
   19076 // CHECK-LABEL: define i8 @test_vqshrnh_n_s16(i16 %a) #0 {
   19077 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
   19078 // CHECK:   [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
   19079 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0
   19080 // CHECK:   ret i8 [[TMP1]]
   19081 int8_t test_vqshrnh_n_s16(int16_t a) {
   19082   return (int8_t)vqshrnh_n_s16(a, 8);
   19083 }
   19084 
   19085 // CHECK-LABEL: define i16 @test_vqshrns_n_s32(i32 %a) #0 {
   19086 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
   19087 // CHECK:   [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
   19088 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0
   19089 // CHECK:   ret i16 [[TMP1]]
   19090 int16_t test_vqshrns_n_s32(int32_t a) {
   19091   return (int16_t)vqshrns_n_s32(a, 16);
   19092 }
   19093 
   19094 // CHECK-LABEL: define i32 @test_vqshrnd_n_s64(i64 %a) #0 {
   19095 // CHECK:   [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %a, i32 32)
   19096 // CHECK:   ret i32 [[VQSHRND_N_S64]]
   19097 int32_t test_vqshrnd_n_s64(int64_t a) {
   19098   return (int32_t)vqshrnd_n_s64(a, 32);
   19099 }
   19100 
   19101 // CHECK-LABEL: define i8 @test_vqshrnh_n_u16(i16 %a) #0 {
   19102 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
   19103 // CHECK:   [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
   19104 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0
   19105 // CHECK:   ret i8 [[TMP1]]
   19106 uint8_t test_vqshrnh_n_u16(uint16_t a) {
   19107   return (uint8_t)vqshrnh_n_u16(a, 8);
   19108 }
   19109 
   19110 // CHECK-LABEL: define i16 @test_vqshrns_n_u32(i32 %a) #0 {
   19111 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
   19112 // CHECK:   [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
   19113 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0
   19114 // CHECK:   ret i16 [[TMP1]]
   19115 uint16_t test_vqshrns_n_u32(uint32_t a) {
   19116   return (uint16_t)vqshrns_n_u32(a, 16);
   19117 }
   19118 
   19119 // CHECK-LABEL: define i32 @test_vqshrnd_n_u64(i64 %a) #0 {
   19120 // CHECK:   [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %a, i32 32)
   19121 // CHECK:   ret i32 [[VQSHRND_N_U64]]
   19122 uint32_t test_vqshrnd_n_u64(uint64_t a) {
   19123   return (uint32_t)vqshrnd_n_u64(a, 32);
   19124 }
   19125 
   19126 // CHECK-LABEL: define i8 @test_vqrshrnh_n_s16(i16 %a) #0 {
   19127 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
   19128 // CHECK:   [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
   19129 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0
   19130 // CHECK:   ret i8 [[TMP1]]
   19131 int8_t test_vqrshrnh_n_s16(int16_t a) {
   19132   return (int8_t)vqrshrnh_n_s16(a, 8);
   19133 }
   19134 
   19135 // CHECK-LABEL: define i16 @test_vqrshrns_n_s32(i32 %a) #0 {
   19136 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
   19137 // CHECK:   [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
   19138 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0
   19139 // CHECK:   ret i16 [[TMP1]]
   19140 int16_t test_vqrshrns_n_s32(int32_t a) {
   19141   return (int16_t)vqrshrns_n_s32(a, 16);
   19142 }
   19143 
   19144 // CHECK-LABEL: define i32 @test_vqrshrnd_n_s64(i64 %a) #0 {
   19145 // CHECK:   [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %a, i32 32)
   19146 // CHECK:   ret i32 [[VQRSHRND_N_S64]]
   19147 int32_t test_vqrshrnd_n_s64(int64_t a) {
   19148   return (int32_t)vqrshrnd_n_s64(a, 32);
   19149 }
   19150 
   19151 // CHECK-LABEL: define i8 @test_vqrshrnh_n_u16(i16 %a) #0 {
   19152 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
   19153 // CHECK:   [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
   19154 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0
   19155 // CHECK:   ret i8 [[TMP1]]
   19156 uint8_t test_vqrshrnh_n_u16(uint16_t a) {
   19157   return (uint8_t)vqrshrnh_n_u16(a, 8);
   19158 }
   19159 
   19160 // CHECK-LABEL: define i16 @test_vqrshrns_n_u32(i32 %a) #0 {
   19161 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
   19162 // CHECK:   [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
   19163 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0
   19164 // CHECK:   ret i16 [[TMP1]]
   19165 uint16_t test_vqrshrns_n_u32(uint32_t a) {
   19166   return (uint16_t)vqrshrns_n_u32(a, 16);
   19167 }
   19168 
   19169 // CHECK-LABEL: define i32 @test_vqrshrnd_n_u64(i64 %a) #0 {
   19170 // CHECK:   [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %a, i32 32)
   19171 // CHECK:   ret i32 [[VQRSHRND_N_U64]]
   19172 uint32_t test_vqrshrnd_n_u64(uint64_t a) {
   19173   return (uint32_t)vqrshrnd_n_u64(a, 32);
   19174 }
   19175 
   19176 // CHECK-LABEL: define i8 @test_vqshrunh_n_s16(i16 %a) #0 {
   19177 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
   19178 // CHECK:   [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
   19179 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0
   19180 // CHECK:   ret i8 [[TMP1]]
   19181 int8_t test_vqshrunh_n_s16(int16_t a) {
   19182   return (int8_t)vqshrunh_n_s16(a, 8);
   19183 }
   19184 
   19185 // CHECK-LABEL: define i16 @test_vqshruns_n_s32(i32 %a) #0 {
   19186 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
   19187 // CHECK:   [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
   19188 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0
   19189 // CHECK:   ret i16 [[TMP1]]
   19190 int16_t test_vqshruns_n_s32(int32_t a) {
   19191   return (int16_t)vqshruns_n_s32(a, 16);
   19192 }
   19193 
   19194 // CHECK-LABEL: define i32 @test_vqshrund_n_s64(i64 %a) #0 {
   19195 // CHECK:   [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %a, i32 32)
   19196 // CHECK:   ret i32 [[VQSHRUND_N_S64]]
   19197 int32_t test_vqshrund_n_s64(int64_t a) {
   19198   return (int32_t)vqshrund_n_s64(a, 32);
   19199 }
   19200 
   19201 // CHECK-LABEL: define i8 @test_vqrshrunh_n_s16(i16 %a) #0 {
   19202 // CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
   19203 // CHECK:   [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
   19204 // CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0
   19205 // CHECK:   ret i8 [[TMP1]]
   19206 int8_t test_vqrshrunh_n_s16(int16_t a) {
   19207   return (int8_t)vqrshrunh_n_s16(a, 8);
   19208 }
   19209 
   19210 // CHECK-LABEL: define i16 @test_vqrshruns_n_s32(i32 %a) #0 {
   19211 // CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
   19212 // CHECK:   [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
   19213 // CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0
   19214 // CHECK:   ret i16 [[TMP1]]
   19215 int16_t test_vqrshruns_n_s32(int32_t a) {
   19216   return (int16_t)vqrshruns_n_s32(a, 16);
   19217 }
   19218 
   19219 // CHECK-LABEL: define i32 @test_vqrshrund_n_s64(i64 %a) #0 {
   19220 // CHECK:   [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %a, i32 32)
   19221 // CHECK:   ret i32 [[VQRSHRUND_N_S64]]
   19222 int32_t test_vqrshrund_n_s64(int64_t a) {
   19223   return (int32_t)vqrshrund_n_s64(a, 32);
   19224 }
   19225 
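// Fixed-point conversions: the second immediate is the number of fractional
// bits, valid from 1 up to the bit width of the integer operand; the tests
// below exercise both extremes.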
// CHECK-LABEL: define float @test_vcvts_n_f32_s32(i32 %a) #0 {
// CHECK:   [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 %a, i32 1)
// CHECK:   ret float [[VCVTS_N_F32_S32]]
float32_t test_vcvts_n_f32_s32(int32_t a) {
  return vcvts_n_f32_s32(a, 1);
}

// CHECK-LABEL: define double @test_vcvtd_n_f64_s64(i64 %a) #0 {
// CHECK:   [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %a, i32 1)
// CHECK:   ret double [[VCVTD_N_F64_S64]]
float64_t test_vcvtd_n_f64_s64(int64_t a) {
  return vcvtd_n_f64_s64(a, 1);
}

// CHECK-LABEL: define float @test_vcvts_n_f32_u32(i32 %a) #0 {
// CHECK:   [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 %a, i32 32)
// CHECK:   ret float [[VCVTS_N_F32_U32]]
float32_t test_vcvts_n_f32_u32(uint32_t a) {
  return vcvts_n_f32_u32(a, 32);
}

// CHECK-LABEL: define double @test_vcvtd_n_f64_u64(i64 %a) #0 {
// CHECK:   [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 %a, i32 64)
// CHECK:   ret double [[VCVTD_N_F64_U64]]
float64_t test_vcvtd_n_f64_u64(uint64_t a) {
  return vcvtd_n_f64_u64(a, 64);
}

// CHECK-LABEL: define i32 @test_vcvts_n_s32_f32(float %a) #0 {
// CHECK:   [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float %a, i32 1)
// CHECK:   ret i32 [[VCVTS_N_S32_F32]]
int32_t test_vcvts_n_s32_f32(float32_t a) {
  return (int32_t)vcvts_n_s32_f32(a, 1);
}

// CHECK-LABEL: define i64 @test_vcvtd_n_s64_f64(double %a) #0 {
// CHECK:   [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double %a, i32 1)
// CHECK:   ret i64 [[VCVTD_N_S64_F64]]
int64_t test_vcvtd_n_s64_f64(float64_t a) {
  return (int64_t)vcvtd_n_s64_f64(a, 1);
}

// CHECK-LABEL: define i32 @test_vcvts_n_u32_f32(float %a) #0 {
// CHECK:   [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float %a, i32 32)
// CHECK:   ret i32 [[VCVTS_N_U32_F32]]
uint32_t test_vcvts_n_u32_f32(float32_t a) {
  return (uint32_t)vcvts_n_u32_f32(a, 32);
}

// CHECK-LABEL: define i64 @test_vcvtd_n_u64_f64(double %a) #0 {
// CHECK:   [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double %a, i32 64)
// CHECK:   ret i64 [[VCVTD_N_U64_F64]]
uint64_t test_vcvtd_n_u64_f64(float64_t a) {
  return (uint64_t)vcvtd_n_u64_f64(a, 64);
}

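// The vreinterpret_* intrinsics only reinterpret the bits of a 64-bit vector:
// each test below verifies that the call lowers to a single bitcast, or to no
// instruction at all when source and destination already share the same IR
// type (e.g. s8 <-> u8, where the argument is returned unchanged).
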
// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
  return vreinterpret_s8_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
  return vreinterpret_s8_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
  return vreinterpret_s8_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
  return vreinterpret_s8_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
  return vreinterpret_s8_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
  return vreinterpret_s8_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
  return vreinterpret_s8_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
  return vreinterpret_s8_f16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
  return vreinterpret_s8_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_f64(float64x1_t a) {
  return vreinterpret_s8_f64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
  return vreinterpret_s8_p8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
  return vreinterpret_s8_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) {
  return vreinterpret_s8_p64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
  return vreinterpret_s16_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
  return vreinterpret_s16_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
  return vreinterpret_s16_s64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
  return vreinterpret_s16_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
  return vreinterpret_s16_u16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
  return vreinterpret_s16_u32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
  return vreinterpret_s16_u64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
  return vreinterpret_s16_f16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
  return vreinterpret_s16_f32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_f64(float64x1_t a) {
  return vreinterpret_s16_f64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
  return vreinterpret_s16_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
  return vreinterpret_s16_p16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) {
  return vreinterpret_s16_p64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
  return vreinterpret_s32_s8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
  return vreinterpret_s32_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
  return vreinterpret_s32_s64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
  return vreinterpret_s32_u8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
  return vreinterpret_s32_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 {
// CHECK:   ret <2 x i32> %a
int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
  return vreinterpret_s32_u32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
  return vreinterpret_s32_u64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
  return vreinterpret_s32_f16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
  return vreinterpret_s32_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_f64(float64x1_t a) {
  return vreinterpret_s32_f64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
  return vreinterpret_s32_p8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
  return vreinterpret_s32_p16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) {
  return vreinterpret_s32_p64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
  return vreinterpret_s64_s8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
  return vreinterpret_s64_s16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
  return vreinterpret_s64_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
  return vreinterpret_s64_u8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
  return vreinterpret_s64_u16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
  return vreinterpret_s64_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
  return vreinterpret_s64_u64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
  return vreinterpret_s64_f16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
  return vreinterpret_s64_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f64(float64x1_t a) {
  return vreinterpret_s64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
  return vreinterpret_s64_p8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
  return vreinterpret_s64_p16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
int64x1_t test_vreinterpret_s64_p64(poly64x1_t a) {
  return vreinterpret_s64_p64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
  return vreinterpret_u8_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
  return vreinterpret_u8_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
  return vreinterpret_u8_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
  return vreinterpret_u8_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
  return vreinterpret_u8_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
  return vreinterpret_u8_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
  return vreinterpret_u8_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
  return vreinterpret_u8_f16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
  return vreinterpret_u8_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) {
  return vreinterpret_u8_f64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
  return vreinterpret_u8_p8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
  return vreinterpret_u8_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) {
  return vreinterpret_u8_p64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
  return vreinterpret_u16_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
  return vreinterpret_u16_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
  return vreinterpret_u16_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
  return vreinterpret_u16_s64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
  return vreinterpret_u16_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
  return vreinterpret_u16_u32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
  return vreinterpret_u16_u64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
  return vreinterpret_u16_f16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
  return vreinterpret_u16_f32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) {
  return vreinterpret_u16_f64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
  return vreinterpret_u16_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
  return vreinterpret_u16_p16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) {
  return vreinterpret_u16_p64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
  return vreinterpret_u32_s8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
  return vreinterpret_u32_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 {
// CHECK:   ret <2 x i32> %a
uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
  return vreinterpret_u32_s32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
  return vreinterpret_u32_s64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
  return vreinterpret_u32_u8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
  return vreinterpret_u32_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
  return vreinterpret_u32_u64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
  return vreinterpret_u32_f16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
  return vreinterpret_u32_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) {
  return vreinterpret_u32_f64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
  return vreinterpret_u32_p8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
  return vreinterpret_u32_p16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) {
  return vreinterpret_u32_p64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
  return vreinterpret_u64_s8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
  return vreinterpret_u64_s16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
  return vreinterpret_u64_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
  return vreinterpret_u64_s64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
  return vreinterpret_u64_u8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
  return vreinterpret_u64_u16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
  return vreinterpret_u64_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
  return vreinterpret_u64_f16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
  return vreinterpret_u64_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) {
  return vreinterpret_u64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
  return vreinterpret_u64_p8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
  return vreinterpret_u64_p16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) {
  return vreinterpret_u64_p64(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
  return vreinterpret_f16_s8(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
  return vreinterpret_f16_s16(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
  return vreinterpret_f16_s32(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
  return vreinterpret_f16_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
  return vreinterpret_f16_u8(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
  return vreinterpret_f16_u16(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
  return vreinterpret_f16_u32(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
  return vreinterpret_f16_u64(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
  return vreinterpret_f16_f32(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_f64(float64x1_t a) {
  return vreinterpret_f16_f64(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
  return vreinterpret_f16_p8(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
  return vreinterpret_f16_p16(a);
}

// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) {
  return vreinterpret_f16_p64(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
  return vreinterpret_f32_s8(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
  return vreinterpret_f32_s16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
  return vreinterpret_f32_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
  return vreinterpret_f32_s64(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
  return vreinterpret_f32_u8(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
  return vreinterpret_f32_u16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
  return vreinterpret_f32_u32(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
  return vreinterpret_f32_u64(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
  return vreinterpret_f32_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_f64(float64x1_t a) {
  return vreinterpret_f32_f64(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
  return vreinterpret_f32_p8(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
  return vreinterpret_f32_p16(a);
}

// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) {
  return vreinterpret_f32_p64(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_s8(int8x8_t a) {
  return vreinterpret_f64_s8(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_s16(int16x4_t a) {
  return vreinterpret_f64_s16(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_s32(int32x2_t a) {
  return vreinterpret_f64_s32(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_s64(int64x1_t a) {
  return vreinterpret_f64_s64(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) {
  return vreinterpret_f64_u8(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) {
  return vreinterpret_f64_u16(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) {
  return vreinterpret_f64_u32(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) {
  return vreinterpret_f64_u64(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_f16(float16x4_t a) {
  return vreinterpret_f64_f16(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_f32(float32x2_t a) {
  return vreinterpret_f64_f32(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) {
  return vreinterpret_f64_p8(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) {
  return vreinterpret_f64_p16(a);
}

// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
// CHECK:   ret <1 x double> [[TMP0]]
float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) {
  return vreinterpret_f64_p64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
  return vreinterpret_p8_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
  return vreinterpret_p8_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
  return vreinterpret_p8_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
  return vreinterpret_p8_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
  return vreinterpret_p8_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
  return vreinterpret_p8_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
  return vreinterpret_p8_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
  return vreinterpret_p8_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
  return vreinterpret_p8_f16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
  return vreinterpret_p8_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) {
  return vreinterpret_p8_f64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
  return vreinterpret_p8_p16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) {
  return vreinterpret_p8_p64(a);
}

   20358 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 {
   20359 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
   20360 // CHECK:   ret <4 x i16> [[TMP0]]
   20361 poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
   20362   return vreinterpret_p16_s8(a);
   20363 }
   20364 
   20365 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 {
   20366 // CHECK:   ret <4 x i16> %a
   20367 poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
   20368   return vreinterpret_p16_s16(a);
   20369 }
   20370 
   20371 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 {
   20372 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
   20373 // CHECK:   ret <4 x i16> [[TMP0]]
   20374 poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
   20375   return vreinterpret_p16_s32(a);
   20376 }
   20377 
   20378 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 {
   20379 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
   20380 // CHECK:   ret <4 x i16> [[TMP0]]
   20381 poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
   20382   return vreinterpret_p16_s64(a);
   20383 }
   20384 
   20385 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 {
   20386 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
   20387 // CHECK:   ret <4 x i16> [[TMP0]]
   20388 poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
   20389   return vreinterpret_p16_u8(a);
   20390 }
   20391 
   20392 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 {
   20393 // CHECK:   ret <4 x i16> %a
   20394 poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
   20395   return vreinterpret_p16_u16(a);
   20396 }
   20397 
   20398 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 {
   20399 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
   20400 // CHECK:   ret <4 x i16> [[TMP0]]
   20401 poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
   20402   return vreinterpret_p16_u32(a);
   20403 }
   20404 
   20405 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 {
   20406 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
   20407 // CHECK:   ret <4 x i16> [[TMP0]]
   20408 poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
   20409   return vreinterpret_p16_u64(a);
   20410 }
   20411 
   20412 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 {
   20413 // CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
   20414 // CHECK:   ret <4 x i16> [[TMP0]]
   20415 poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
   20416   return vreinterpret_p16_f16(a);
   20417 }
   20418 
   20419 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 {
   20420 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
   20421 // CHECK:   ret <4 x i16> [[TMP0]]
   20422 poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
   20423   return vreinterpret_p16_f32(a);
   20424 }
   20425 
   20426 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f64(<1 x double> %a) #0 {
   20427 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
   20428 // CHECK:   ret <4 x i16> [[TMP0]]
   20429 poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) {
   20430   return vreinterpret_p16_f64(a);
   20431 }
   20432 
   20433 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 {
   20434 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
   20435 // CHECK:   ret <4 x i16> [[TMP0]]
   20436 poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
   20437   return vreinterpret_p16_p8(a);
   20438 }
   20439 
   20440 // CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p64(<1 x i64> %a) #0 {
   20441 // CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
   20442 // CHECK:   ret <4 x i16> [[TMP0]]
   20443 poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) {
   20444   return vreinterpret_p16_p64(a);
   20445 }
   20446 
   20447 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s8(<8 x i8> %a) #0 {
   20448 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
   20449 // CHECK:   ret <1 x i64> [[TMP0]]
   20450 poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) {
   20451   return vreinterpret_p64_s8(a);
   20452 }
   20453 
   20454 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s16(<4 x i16> %a) #0 {
   20455 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
   20456 // CHECK:   ret <1 x i64> [[TMP0]]
   20457 poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) {
   20458   return vreinterpret_p64_s16(a);
   20459 }
   20460 
   20461 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s32(<2 x i32> %a) #0 {
   20462 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
   20463 // CHECK:   ret <1 x i64> [[TMP0]]
   20464 poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) {
   20465   return vreinterpret_p64_s32(a);
   20466 }
   20467 
   20468 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s64(<1 x i64> %a) #0 {
   20469 // CHECK:   ret <1 x i64> %a
   20470 poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) {
   20471   return vreinterpret_p64_s64(a);
   20472 }
   20473 
   20474 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u8(<8 x i8> %a) #0 {
   20475 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
   20476 // CHECK:   ret <1 x i64> [[TMP0]]
   20477 poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) {
   20478   return vreinterpret_p64_u8(a);
   20479 }
   20480 
   20481 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u16(<4 x i16> %a) #0 {
   20482 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
   20483 // CHECK:   ret <1 x i64> [[TMP0]]
   20484 poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) {
   20485   return vreinterpret_p64_u16(a);
   20486 }
   20487 
   20488 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u32(<2 x i32> %a) #0 {
   20489 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
   20490 // CHECK:   ret <1 x i64> [[TMP0]]
   20491 poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) {
   20492   return vreinterpret_p64_u32(a);
   20493 }
   20494 
   20495 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u64(<1 x i64> %a) #0 {
   20496 // CHECK:   ret <1 x i64> %a
   20497 poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) {
   20498   return vreinterpret_p64_u64(a);
   20499 }
   20500 
   20501 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f16(<4 x half> %a) #0 {
   20502 // CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
   20503 // CHECK:   ret <1 x i64> [[TMP0]]
   20504 poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) {
   20505   return vreinterpret_p64_f16(a);
   20506 }
   20507 
   20508 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f32(<2 x float> %a) #0 {
   20509 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
   20510 // CHECK:   ret <1 x i64> [[TMP0]]
   20511 poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) {
   20512   return vreinterpret_p64_f32(a);
   20513 }
   20514 
   20515 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f64(<1 x double> %a) #0 {
   20516 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
   20517 // CHECK:   ret <1 x i64> [[TMP0]]
   20518 poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) {
   20519   return vreinterpret_p64_f64(a);
   20520 }
   20521 
   20522 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p8(<8 x i8> %a) #0 {
   20523 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
   20524 // CHECK:   ret <1 x i64> [[TMP0]]
   20525 poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) {
   20526   return vreinterpret_p64_p8(a);
   20527 }
   20528 
   20529 // CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p16(<4 x i16> %a) #0 {
   20530 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
   20531 // CHECK:   ret <1 x i64> [[TMP0]]
   20532 poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) {
   20533   return vreinterpret_p64_p16(a);
   20534 }
   20535 
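// Everything below exercises the q-register (128-bit) forms. A vreinterpret
// can only retype a vector of the same total width, so the 64-bit d-register
// and 128-bit q-register families never mix; within a family the result is a
// single bitcast or nothing. Illustrative sketch, not part of the test:
static inline int8x16_t sketch_retype_q(int16x8_t v) {
  return vreinterpretq_s8_s16(v); // <8 x i16> viewed as <16 x i8>; no data movement
}
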
   20536 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 {
   20537 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   20538 // CHECK:   ret <16 x i8> [[TMP0]]
   20539 int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
   20540   return vreinterpretq_s8_s16(a);
   20541 }
   20542 
   20543 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 {
   20544 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   20545 // CHECK:   ret <16 x i8> [[TMP0]]
   20546 int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
   20547   return vreinterpretq_s8_s32(a);
   20548 }
   20549 
   20550 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 {
   20551 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   20552 // CHECK:   ret <16 x i8> [[TMP0]]
   20553 int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
   20554   return vreinterpretq_s8_s64(a);
   20555 }
   20556 
   20557 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 {
   20558 // CHECK:   ret <16 x i8> %a
   20559 int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
   20560   return vreinterpretq_s8_u8(a);
   20561 }
   20562 
   20563 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 {
   20564 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   20565 // CHECK:   ret <16 x i8> [[TMP0]]
   20566 int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
   20567   return vreinterpretq_s8_u16(a);
   20568 }
   20569 
   20570 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 {
   20571 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   20572 // CHECK:   ret <16 x i8> [[TMP0]]
   20573 int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
   20574   return vreinterpretq_s8_u32(a);
   20575 }
   20576 
   20577 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 {
   20578 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   20579 // CHECK:   ret <16 x i8> [[TMP0]]
   20580 int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
   20581   return vreinterpretq_s8_u64(a);
   20582 }
   20583 
   20584 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 {
   20585 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
   20586 // CHECK:   ret <16 x i8> [[TMP0]]
   20587 int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
   20588   return vreinterpretq_s8_f16(a);
   20589 }
   20590 
   20591 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 {
   20592 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   20593 // CHECK:   ret <16 x i8> [[TMP0]]
   20594 int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
   20595   return vreinterpretq_s8_f32(a);
   20596 }
   20597 
   20598 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f64(<2 x double> %a) #0 {
   20599 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   20600 // CHECK:   ret <16 x i8> [[TMP0]]
   20601 int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) {
   20602   return vreinterpretq_s8_f64(a);
   20603 }
   20604 
   20605 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 {
   20606 // CHECK:   ret <16 x i8> %a
   20607 int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
   20608   return vreinterpretq_s8_p8(a);
   20609 }
   20610 
   20611 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 {
   20612 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   20613 // CHECK:   ret <16 x i8> [[TMP0]]
   20614 int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
   20615   return vreinterpretq_s8_p16(a);
   20616 }
   20617 
   20618 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p64(<2 x i64> %a) #0 {
   20619 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   20620 // CHECK:   ret <16 x i8> [[TMP0]]
   20621 int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) {
   20622   return vreinterpretq_s8_p64(a);
   20623 }
   20624 
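// One practical reason to retype to bytes: byte-granular operations such as
// the per-byte popcount. Hedged sketch (the helper name is illustrative):
static inline int8x16_t sketch_popcount_bytes(int32x4_t v) {
  int8x16_t bytes = vreinterpretq_s8_s32(v); // same 128 bits, byte lanes
  return vcntq_s8(bytes);                    // CNT: popcount of each byte
}
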
   20625 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 {
   20626 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   20627 // CHECK:   ret <8 x i16> [[TMP0]]
   20628 int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
   20629   return vreinterpretq_s16_s8(a);
   20630 }
   20631 
   20632 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 {
   20633 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
   20634 // CHECK:   ret <8 x i16> [[TMP0]]
   20635 int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
   20636   return vreinterpretq_s16_s32(a);
   20637 }
   20638 
   20639 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 {
   20640 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   20641 // CHECK:   ret <8 x i16> [[TMP0]]
   20642 int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
   20643   return vreinterpretq_s16_s64(a);
   20644 }
   20645 
   20646 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 {
   20647 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   20648 // CHECK:   ret <8 x i16> [[TMP0]]
   20649 int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
   20650   return vreinterpretq_s16_u8(a);
   20651 }
   20652 
   20653 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 {
   20654 // CHECK:   ret <8 x i16> %a
   20655 int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
   20656   return vreinterpretq_s16_u16(a);
   20657 }
   20658 
   20659 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 {
   20660 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
   20661 // CHECK:   ret <8 x i16> [[TMP0]]
   20662 int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
   20663   return vreinterpretq_s16_u32(a);
   20664 }
   20665 
   20666 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 {
   20667 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   20668 // CHECK:   ret <8 x i16> [[TMP0]]
   20669 int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
   20670   return vreinterpretq_s16_u64(a);
   20671 }
   20672 
   20673 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 {
   20674 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
   20675 // CHECK:   ret <8 x i16> [[TMP0]]
   20676 int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
   20677   return vreinterpretq_s16_f16(a);
   20678 }
   20679 
   20680 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 {
   20681 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
   20682 // CHECK:   ret <8 x i16> [[TMP0]]
   20683 int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
   20684   return vreinterpretq_s16_f32(a);
   20685 }
   20686 
   20687 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f64(<2 x double> %a) #0 {
   20688 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
   20689 // CHECK:   ret <8 x i16> [[TMP0]]
   20690 int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) {
   20691   return vreinterpretq_s16_f64(a);
   20692 }
   20693 
   20694 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 {
   20695 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   20696 // CHECK:   ret <8 x i16> [[TMP0]]
   20697 int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
   20698   return vreinterpretq_s16_p8(a);
   20699 }
   20700 
   20701 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 {
   20702 // CHECK:   ret <8 x i16> %a
   20703 int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
   20704   return vreinterpretq_s16_p16(a);
   20705 }
   20706 
   20707 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p64(<2 x i64> %a) #0 {
   20708 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   20709 // CHECK:   ret <8 x i16> [[TMP0]]
   20710 int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) {
   20711   return vreinterpretq_s16_p64(a);
   20712 }
   20713 
   20714 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 {
   20715 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
   20716 // CHECK:   ret <4 x i32> [[TMP0]]
   20717 int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
   20718   return vreinterpretq_s32_s8(a);
   20719 }
   20720 
   20721 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 {
   20722 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
   20723 // CHECK:   ret <4 x i32> [[TMP0]]
   20724 int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
   20725   return vreinterpretq_s32_s16(a);
   20726 }
   20727 
   20728 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 {
   20729 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
   20730 // CHECK:   ret <4 x i32> [[TMP0]]
   20731 int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
   20732   return vreinterpretq_s32_s64(a);
   20733 }
   20734 
   20735 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 {
   20736 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
   20737 // CHECK:   ret <4 x i32> [[TMP0]]
   20738 int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
   20739   return vreinterpretq_s32_u8(a);
   20740 }
   20741 
   20742 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 {
   20743 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
   20744 // CHECK:   ret <4 x i32> [[TMP0]]
   20745 int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
   20746   return vreinterpretq_s32_u16(a);
   20747 }
   20748 
   20749 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 {
   20750 // CHECK:   ret <4 x i32> %a
   20751 int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
   20752   return vreinterpretq_s32_u32(a);
   20753 }
   20754 
   20755 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 {
   20756 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
   20757 // CHECK:   ret <4 x i32> [[TMP0]]
   20758 int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
   20759   return vreinterpretq_s32_u64(a);
   20760 }
   20761 
   20762 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 {
   20763 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
   20764 // CHECK:   ret <4 x i32> [[TMP0]]
   20765 int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
   20766   return vreinterpretq_s32_f16(a);
   20767 }
   20768 
   20769 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 {
   20770 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
   20771 // CHECK:   ret <4 x i32> [[TMP0]]
   20772 int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
   20773   return vreinterpretq_s32_f32(a);
   20774 }
   20775 
   20776 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f64(<2 x double> %a) #0 {
   20777 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
   20778 // CHECK:   ret <4 x i32> [[TMP0]]
   20779 int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) {
   20780   return vreinterpretq_s32_f64(a);
   20781 }
   20782 
   20783 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 {
   20784 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
   20785 // CHECK:   ret <4 x i32> [[TMP0]]
   20786 int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
   20787   return vreinterpretq_s32_p8(a);
   20788 }
   20789 
   20790 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 {
   20791 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
   20792 // CHECK:   ret <4 x i32> [[TMP0]]
   20793 int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
   20794   return vreinterpretq_s32_p16(a);
   20795 }
   20796 
   20797 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p64(<2 x i64> %a) #0 {
   20798 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
   20799 // CHECK:   ret <4 x i32> [[TMP0]]
   20800 int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) {
   20801   return vreinterpretq_s32_p64(a);
   20802 }
   20803 
   20804 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 {
   20805 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
   20806 // CHECK:   ret <2 x i64> [[TMP0]]
   20807 int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
   20808   return vreinterpretq_s64_s8(a);
   20809 }
   20810 
   20811 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 {
   20812 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
   20813 // CHECK:   ret <2 x i64> [[TMP0]]
   20814 int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
   20815   return vreinterpretq_s64_s16(a);
   20816 }
   20817 
   20818 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 {
   20819 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
   20820 // CHECK:   ret <2 x i64> [[TMP0]]
   20821 int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
   20822   return vreinterpretq_s64_s32(a);
   20823 }
   20824 
   20825 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 {
   20826 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
   20827 // CHECK:   ret <2 x i64> [[TMP0]]
   20828 int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
   20829   return vreinterpretq_s64_u8(a);
   20830 }
   20831 
   20832 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 {
   20833 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
   20834 // CHECK:   ret <2 x i64> [[TMP0]]
   20835 int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
   20836   return vreinterpretq_s64_u16(a);
   20837 }
   20838 
   20839 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 {
   20840 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
   20841 // CHECK:   ret <2 x i64> [[TMP0]]
   20842 int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
   20843   return vreinterpretq_s64_u32(a);
   20844 }
   20845 
   20846 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 {
   20847 // CHECK:   ret <2 x i64> %a
   20848 int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
   20849   return vreinterpretq_s64_u64(a);
   20850 }
   20851 
   20852 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 {
   20853 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
   20854 // CHECK:   ret <2 x i64> [[TMP0]]
   20855 int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
   20856   return vreinterpretq_s64_f16(a);
   20857 }
   20858 
   20859 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 {
   20860 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
   20861 // CHECK:   ret <2 x i64> [[TMP0]]
   20862 int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
   20863   return vreinterpretq_s64_f32(a);
   20864 }
   20865 
   20866 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f64(<2 x double> %a) #0 {
   20867 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
   20868 // CHECK:   ret <2 x i64> [[TMP0]]
   20869 int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) {
   20870   return vreinterpretq_s64_f64(a);
   20871 }
   20872 
   20873 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 {
   20874 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
   20875 // CHECK:   ret <2 x i64> [[TMP0]]
   20876 int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
   20877   return vreinterpretq_s64_p8(a);
   20878 }
   20879 
   20880 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 {
   20881 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
   20882 // CHECK:   ret <2 x i64> [[TMP0]]
   20883 int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
   20884   return vreinterpretq_s64_p16(a);
   20885 }
   20886 
   20887 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p64(<2 x i64> %a) #0 {
   20888 // CHECK:   ret <2 x i64> %a
   20889 int64x2_t test_vreinterpretq_s64_p64(poly64x2_t a) {
   20890   return vreinterpretq_s64_p64(a);
   20891 }
   20892 
   20893 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 {
   20894 // CHECK:   ret <16 x i8> %a
   20895 uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
   20896   return vreinterpretq_u8_s8(a);
   20897 }
   20898 
   20899 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 {
   20900 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   20901 // CHECK:   ret <16 x i8> [[TMP0]]
   20902 uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
   20903   return vreinterpretq_u8_s16(a);
   20904 }
   20905 
   20906 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 {
   20907 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   20908 // CHECK:   ret <16 x i8> [[TMP0]]
   20909 uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
   20910   return vreinterpretq_u8_s32(a);
   20911 }
   20912 
   20913 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 {
   20914 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   20915 // CHECK:   ret <16 x i8> [[TMP0]]
   20916 uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
   20917   return vreinterpretq_u8_s64(a);
   20918 }
   20919 
   20920 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 {
   20921 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   20922 // CHECK:   ret <16 x i8> [[TMP0]]
   20923 uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
   20924   return vreinterpretq_u8_u16(a);
   20925 }
   20926 
   20927 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 {
   20928 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   20929 // CHECK:   ret <16 x i8> [[TMP0]]
   20930 uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
   20931   return vreinterpretq_u8_u32(a);
   20932 }
   20933 
   20934 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 {
   20935 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   20936 // CHECK:   ret <16 x i8> [[TMP0]]
   20937 uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
   20938   return vreinterpretq_u8_u64(a);
   20939 }
   20940 
   20941 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 {
   20942 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
   20943 // CHECK:   ret <16 x i8> [[TMP0]]
   20944 uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
   20945   return vreinterpretq_u8_f16(a);
   20946 }
   20947 
   20948 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 {
   20949 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   20950 // CHECK:   ret <16 x i8> [[TMP0]]
   20951 uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
   20952   return vreinterpretq_u8_f32(a);
   20953 }
   20954 
   20955 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f64(<2 x double> %a) #0 {
   20956 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   20957 // CHECK:   ret <16 x i8> [[TMP0]]
   20958 uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) {
   20959   return vreinterpretq_u8_f64(a);
   20960 }
   20961 
   20962 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 {
   20963 // CHECK:   ret <16 x i8> %a
   20964 uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
   20965   return vreinterpretq_u8_p8(a);
   20966 }
   20967 
   20968 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 {
   20969 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   20970 // CHECK:   ret <16 x i8> [[TMP0]]
   20971 uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
   20972   return vreinterpretq_u8_p16(a);
   20973 }
   20974 
   20975 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p64(<2 x i64> %a) #0 {
   20976 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   20977 // CHECK:   ret <16 x i8> [[TMP0]]
   20978 uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) {
   20979   return vreinterpretq_u8_p64(a);
   20980 }
   20981 
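// These intrinsics are the register-level analogue of a memcpy-based type
// pun. A hedged equivalence sketch (assumes the little-endian lane/byte
// mapping, where the two views agree; helper name is illustrative):
static inline uint8x16_t sketch_memcpy_equivalent(uint32x4_t v) {
  uint8x16_t out;
  __builtin_memcpy(&out, &v, sizeof out); // same bytes vreinterpretq_u8_u32 yields
  return out;
}
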
   20982 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 {
   20983 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   20984 // CHECK:   ret <8 x i16> [[TMP0]]
   20985 uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
   20986   return vreinterpretq_u16_s8(a);
   20987 }
   20988 
   20989 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 {
   20990 // CHECK:   ret <8 x i16> %a
   20991 uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
   20992   return vreinterpretq_u16_s16(a);
   20993 }
   20994 
   20995 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 {
   20996 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
   20997 // CHECK:   ret <8 x i16> [[TMP0]]
   20998 uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
   20999   return vreinterpretq_u16_s32(a);
   21000 }
   21001 
   21002 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 {
   21003 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   21004 // CHECK:   ret <8 x i16> [[TMP0]]
   21005 uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
   21006   return vreinterpretq_u16_s64(a);
   21007 }
   21008 
   21009 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 {
   21010 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   21011 // CHECK:   ret <8 x i16> [[TMP0]]
   21012 uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
   21013   return vreinterpretq_u16_u8(a);
   21014 }
   21015 
   21016 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 {
   21017 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
   21018 // CHECK:   ret <8 x i16> [[TMP0]]
   21019 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
   21020   return vreinterpretq_u16_u32(a);
   21021 }
   21022 
   21023 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 {
   21024 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   21025 // CHECK:   ret <8 x i16> [[TMP0]]
   21026 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
   21027   return vreinterpretq_u16_u64(a);
   21028 }
   21029 
   21030 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 {
   21031 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
   21032 // CHECK:   ret <8 x i16> [[TMP0]]
   21033 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
   21034   return vreinterpretq_u16_f16(a);
   21035 }
   21036 
   21037 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 {
   21038 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
   21039 // CHECK:   ret <8 x i16> [[TMP0]]
   21040 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
   21041   return vreinterpretq_u16_f32(a);
   21042 }
   21043 
   21044 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f64(<2 x double> %a) #0 {
   21045 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
   21046 // CHECK:   ret <8 x i16> [[TMP0]]
   21047 uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) {
   21048   return vreinterpretq_u16_f64(a);
   21049 }
   21050 
   21051 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 {
   21052 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   21053 // CHECK:   ret <8 x i16> [[TMP0]]
   21054 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
   21055   return vreinterpretq_u16_p8(a);
   21056 }
   21057 
   21058 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 {
   21059 // CHECK:   ret <8 x i16> %a
   21060 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
   21061   return vreinterpretq_u16_p16(a);
   21062 }
   21063 
   21064 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p64(<2 x i64> %a) #0 {
   21065 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   21066 // CHECK:   ret <8 x i16> [[TMP0]]
   21067 uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) {
   21068   return vreinterpretq_u16_p64(a);
   21069 }
   21070 
   21071 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 {
   21072 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
   21073 // CHECK:   ret <4 x i32> [[TMP0]]
   21074 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
   21075   return vreinterpretq_u32_s8(a);
   21076 }
   21077 
   21078 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 {
   21079 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
   21080 // CHECK:   ret <4 x i32> [[TMP0]]
   21081 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
   21082   return vreinterpretq_u32_s16(a);
   21083 }
   21084 
   21085 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 {
   21086 // CHECK:   ret <4 x i32> %a
   21087 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
   21088   return vreinterpretq_u32_s32(a);
   21089 }
   21090 
   21091 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 {
   21092 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
   21093 // CHECK:   ret <4 x i32> [[TMP0]]
   21094 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
   21095   return vreinterpretq_u32_s64(a);
   21096 }
   21097 
   21098 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 {
   21099 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
   21100 // CHECK:   ret <4 x i32> [[TMP0]]
   21101 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
   21102   return vreinterpretq_u32_u8(a);
   21103 }
   21104 
   21105 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 {
   21106 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
   21107 // CHECK:   ret <4 x i32> [[TMP0]]
   21108 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
   21109   return vreinterpretq_u32_u16(a);
   21110 }
   21111 
   21112 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 {
   21113 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
   21114 // CHECK:   ret <4 x i32> [[TMP0]]
   21115 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
   21116   return vreinterpretq_u32_u64(a);
   21117 }
   21118 
   21119 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 {
   21120 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
   21121 // CHECK:   ret <4 x i32> [[TMP0]]
   21122 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
   21123   return vreinterpretq_u32_f16(a);
   21124 }
   21125 
   21126 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 {
   21127 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
   21128 // CHECK:   ret <4 x i32> [[TMP0]]
   21129 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
   21130   return vreinterpretq_u32_f32(a);
   21131 }
   21132 
   21133 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f64(<2 x double> %a) #0 {
   21134 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
   21135 // CHECK:   ret <4 x i32> [[TMP0]]
   21136 uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) {
   21137   return vreinterpretq_u32_f64(a);
   21138 }
   21139 
   21140 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 {
   21141 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
   21142 // CHECK:   ret <4 x i32> [[TMP0]]
   21143 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
   21144   return vreinterpretq_u32_p8(a);
   21145 }
   21146 
   21147 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 {
   21148 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
   21149 // CHECK:   ret <4 x i32> [[TMP0]]
   21150 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
   21151   return vreinterpretq_u32_p16(a);
   21152 }
   21153 
   21154 // CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p64(<2 x i64> %a) #0 {
   21155 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
   21156 // CHECK:   ret <4 x i32> [[TMP0]]
   21157 uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) {
   21158   return vreinterpretq_u32_p64(a);
   21159 }
   21160 
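// Retyping float lanes as integers enables bit-level work on IEEE-754 values.
// Hedged sketch: isolate each lane's sign bit (helper name is illustrative):
static inline uint32x4_t sketch_sign_bits(float32x4_t v) {
  uint32x4_t bits = vreinterpretq_u32_f32(v); // raw IEEE-754 bit patterns
  return vshrq_n_u32(bits, 31);               // logical shift keeps only the sign
}
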
   21161 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 {
   21162 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
   21163 // CHECK:   ret <2 x i64> [[TMP0]]
   21164 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
   21165   return vreinterpretq_u64_s8(a);
   21166 }
   21167 
   21168 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 {
   21169 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
   21170 // CHECK:   ret <2 x i64> [[TMP0]]
   21171 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
   21172   return vreinterpretq_u64_s16(a);
   21173 }
   21174 
   21175 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 {
   21176 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
   21177 // CHECK:   ret <2 x i64> [[TMP0]]
   21178 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
   21179   return vreinterpretq_u64_s32(a);
   21180 }
   21181 
   21182 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 {
   21183 // CHECK:   ret <2 x i64> %a
   21184 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
   21185   return vreinterpretq_u64_s64(a);
   21186 }
   21187 
   21188 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 {
   21189 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
   21190 // CHECK:   ret <2 x i64> [[TMP0]]
   21191 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
   21192   return vreinterpretq_u64_u8(a);
   21193 }
   21194 
   21195 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 {
   21196 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
   21197 // CHECK:   ret <2 x i64> [[TMP0]]
   21198 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
   21199   return vreinterpretq_u64_u16(a);
   21200 }
   21201 
   21202 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 {
   21203 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
   21204 // CHECK:   ret <2 x i64> [[TMP0]]
   21205 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
   21206   return vreinterpretq_u64_u32(a);
   21207 }
   21208 
   21209 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 {
   21210 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
   21211 // CHECK:   ret <2 x i64> [[TMP0]]
   21212 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
   21213   return vreinterpretq_u64_f16(a);
   21214 }
   21215 
   21216 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 {
   21217 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
   21218 // CHECK:   ret <2 x i64> [[TMP0]]
   21219 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
   21220   return vreinterpretq_u64_f32(a);
   21221 }
   21222 
   21223 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f64(<2 x double> %a) #0 {
   21224 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
   21225 // CHECK:   ret <2 x i64> [[TMP0]]
   21226 uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) {
   21227   return vreinterpretq_u64_f64(a);
   21228 }
   21229 
   21230 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 {
   21231 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
   21232 // CHECK:   ret <2 x i64> [[TMP0]]
   21233 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
   21234   return vreinterpretq_u64_p8(a);
   21235 }
   21236 
   21237 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 {
   21238 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
   21239 // CHECK:   ret <2 x i64> [[TMP0]]
   21240 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
   21241   return vreinterpretq_u64_p16(a);
   21242 }
   21243 
   21244 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p64(<2 x i64> %a) #0 {
   21245 // CHECK:   ret <2 x i64> %a
   21246 uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) {
   21247   return vreinterpretq_u64_p64(a);
   21248 }
   21249 
   21250 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 {
   21251 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
   21252 // CHECK:   ret <8 x half> [[TMP0]]
   21253 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
   21254   return vreinterpretq_f16_s8(a);
   21255 }
   21256 
   21257 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 {
   21258 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
   21259 // CHECK:   ret <8 x half> [[TMP0]]
   21260 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
   21261   return vreinterpretq_f16_s16(a);
   21262 }
   21263 
   21264 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 {
   21265 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
   21266 // CHECK:   ret <8 x half> [[TMP0]]
   21267 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
   21268   return vreinterpretq_f16_s32(a);
   21269 }
   21270 
   21271 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 {
   21272 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
   21273 // CHECK:   ret <8 x half> [[TMP0]]
   21274 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
   21275   return vreinterpretq_f16_s64(a);
   21276 }
   21277 
   21278 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 {
   21279 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
   21280 // CHECK:   ret <8 x half> [[TMP0]]
   21281 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
   21282   return vreinterpretq_f16_u8(a);
   21283 }
   21284 
   21285 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 {
   21286 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
   21287 // CHECK:   ret <8 x half> [[TMP0]]
   21288 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
   21289   return vreinterpretq_f16_u16(a);
   21290 }
   21291 
   21292 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 {
   21293 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
   21294 // CHECK:   ret <8 x half> [[TMP0]]
   21295 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
   21296   return vreinterpretq_f16_u32(a);
   21297 }
   21298 
   21299 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 {
   21300 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
   21301 // CHECK:   ret <8 x half> [[TMP0]]
   21302 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
   21303   return vreinterpretq_f16_u64(a);
   21304 }
   21305 
   21306 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 {
   21307 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
   21308 // CHECK:   ret <8 x half> [[TMP0]]
   21309 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
   21310   return vreinterpretq_f16_f32(a);
   21311 }
   21312 
   21313 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f64(<2 x double> %a) #0 {
   21314 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x half>
   21315 // CHECK:   ret <8 x half> [[TMP0]]
   21316 float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) {
   21317   return vreinterpretq_f16_f64(a);
   21318 }
   21319 
   21320 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 {
   21321 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
   21322 // CHECK:   ret <8 x half> [[TMP0]]
   21323 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
   21324   return vreinterpretq_f16_p8(a);
   21325 }
   21326 
   21327 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 {
   21328 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
   21329 // CHECK:   ret <8 x half> [[TMP0]]
   21330 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
   21331   return vreinterpretq_f16_p16(a);
   21332 }
   21333 
   21334 // CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p64(<2 x i64> %a) #0 {
   21335 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
   21336 // CHECK:   ret <8 x half> [[TMP0]]
   21337 float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) {
   21338   return vreinterpretq_f16_p64(a);
   21339 }
   21340 
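// Note the contrast with value conversion: vreinterpretq_f16_* reuses the
// bits as <8 x half> lanes, whereas converting half values to float requires
// vcvt and changes the bit patterns. Hedged sketch of the conversion side
// (not part of the checked test):
static inline float32x4_t sketch_convert_not_reinterpret(float16x4_t h) {
  return vcvt_f32_f16(h); // widens four half values to float; not a bitcast
}
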
   21341 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 {
   21342 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
   21343 // CHECK:   ret <4 x float> [[TMP0]]
   21344 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
   21345   return vreinterpretq_f32_s8(a);
   21346 }
   21347 
   21348 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 {
   21349 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
   21350 // CHECK:   ret <4 x float> [[TMP0]]
   21351 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
   21352   return vreinterpretq_f32_s16(a);
   21353 }
   21354 
   21355 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 {
   21356 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
   21357 // CHECK:   ret <4 x float> [[TMP0]]
   21358 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
   21359   return vreinterpretq_f32_s32(a);
   21360 }
   21361 
   21362 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 {
   21363 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
   21364 // CHECK:   ret <4 x float> [[TMP0]]
   21365 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
   21366   return vreinterpretq_f32_s64(a);
   21367 }
   21368 
   21369 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 {
   21370 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
   21371 // CHECK:   ret <4 x float> [[TMP0]]
   21372 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
   21373   return vreinterpretq_f32_u8(a);
   21374 }
   21375 
   21376 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 {
   21377 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
   21378 // CHECK:   ret <4 x float> [[TMP0]]
   21379 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
   21380   return vreinterpretq_f32_u16(a);
   21381 }
   21382 
   21383 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 {
   21384 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
   21385 // CHECK:   ret <4 x float> [[TMP0]]
   21386 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
   21387   return vreinterpretq_f32_u32(a);
   21388 }
   21389 
   21390 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 {
   21391 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
   21392 // CHECK:   ret <4 x float> [[TMP0]]
   21393 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
   21394   return vreinterpretq_f32_u64(a);
   21395 }
   21396 
   21397 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 {
   21398 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
   21399 // CHECK:   ret <4 x float> [[TMP0]]
   21400 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
   21401   return vreinterpretq_f32_f16(a);
   21402 }
   21403 
   21404 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f64(<2 x double> %a) #0 {
   21405 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x float>
   21406 // CHECK:   ret <4 x float> [[TMP0]]
   21407 float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) {
   21408   return vreinterpretq_f32_f64(a);
   21409 }
   21410 
   21411 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 {
   21412 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
   21413 // CHECK:   ret <4 x float> [[TMP0]]
   21414 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
   21415   return vreinterpretq_f32_p8(a);
   21416 }
   21417 
   21418 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 {
   21419 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
   21420 // CHECK:   ret <4 x float> [[TMP0]]
   21421 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
   21422   return vreinterpretq_f32_p16(a);
   21423 }
   21424 
   21425 // CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p64(<2 x i64> %a) #0 {
   21426 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
   21427 // CHECK:   ret <4 x float> [[TMP0]]
   21428 float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) {
   21429   return vreinterpretq_f32_p64(a);
   21430 }
   21431 
   21432 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s8(<16 x i8> %a) #0 {
   21433 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
   21434 // CHECK:   ret <2 x double> [[TMP0]]
   21435 float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) {
   21436   return vreinterpretq_f64_s8(a);
   21437 }
   21438 
   21439 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s16(<8 x i16> %a) #0 {
   21440 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
   21441 // CHECK:   ret <2 x double> [[TMP0]]
   21442 float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) {
   21443   return vreinterpretq_f64_s16(a);
   21444 }
   21445 
   21446 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s32(<4 x i32> %a) #0 {
   21447 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
   21448 // CHECK:   ret <2 x double> [[TMP0]]
   21449 float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) {
   21450   return vreinterpretq_f64_s32(a);
   21451 }
   21452 
   21453 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s64(<2 x i64> %a) #0 {
   21454 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
   21455 // CHECK:   ret <2 x double> [[TMP0]]
   21456 float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) {
   21457   return vreinterpretq_f64_s64(a);
   21458 }
   21459 
   21460 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u8(<16 x i8> %a) #0 {
   21461 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
   21462 // CHECK:   ret <2 x double> [[TMP0]]
   21463 float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) {
   21464   return vreinterpretq_f64_u8(a);
   21465 }
   21466 
   21467 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u16(<8 x i16> %a) #0 {
   21468 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
   21469 // CHECK:   ret <2 x double> [[TMP0]]
   21470 float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) {
   21471   return vreinterpretq_f64_u16(a);
   21472 }
   21473 
   21474 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u32(<4 x i32> %a) #0 {
   21475 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
   21476 // CHECK:   ret <2 x double> [[TMP0]]
   21477 float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) {
   21478   return vreinterpretq_f64_u32(a);
   21479 }
   21480 
   21481 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u64(<2 x i64> %a) #0 {
   21482 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
   21483 // CHECK:   ret <2 x double> [[TMP0]]
   21484 float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) {
   21485   return vreinterpretq_f64_u64(a);
   21486 }
   21487 
   21488 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f16(<8 x half> %a) #0 {
   21489 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x double>
   21490 // CHECK:   ret <2 x double> [[TMP0]]
   21491 float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) {
   21492   return vreinterpretq_f64_f16(a);
   21493 }
   21494 
   21495 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f32(<4 x float> %a) #0 {
   21496 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x double>
   21497 // CHECK:   ret <2 x double> [[TMP0]]
   21498 float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) {
   21499   return vreinterpretq_f64_f32(a);
   21500 }
   21501 
   21502 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p8(<16 x i8> %a) #0 {
   21503 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
   21504 // CHECK:   ret <2 x double> [[TMP0]]
   21505 float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) {
   21506   return vreinterpretq_f64_p8(a);
   21507 }
   21508 
   21509 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p16(<8 x i16> %a) #0 {
   21510 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
   21511 // CHECK:   ret <2 x double> [[TMP0]]
   21512 float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) {
   21513   return vreinterpretq_f64_p16(a);
   21514 }
   21515 
   21516 // CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p64(<2 x i64> %a) #0 {
   21517 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
   21518 // CHECK:   ret <2 x double> [[TMP0]]
   21519 float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) {
   21520   return vreinterpretq_f64_p64(a);
   21521 }
   21522 
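// A round trip through the integer view supports bit manipulation of doubles,
// e.g. a branch-free fabs. Hedged sketch (helper name is illustrative):
static inline float64x2_t sketch_fabs_by_mask(float64x2_t v) {
  uint64x2_t bits = vreinterpretq_u64_f64(v);                 // f64 lanes as u64
  bits = vbicq_u64(bits, vdupq_n_u64(0x8000000000000000ULL)); // clear sign bits
  return vreinterpretq_f64_u64(bits);                         // back to f64 lanes
}
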
   21523 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 {
   21524 // CHECK:   ret <16 x i8> %a
   21525 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
   21526   return vreinterpretq_p8_s8(a);
   21527 }
   21528 
   21529 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 {
   21530 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   21531 // CHECK:   ret <16 x i8> [[TMP0]]
   21532 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
   21533   return vreinterpretq_p8_s16(a);
   21534 }
   21535 
   21536 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 {
   21537 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   21538 // CHECK:   ret <16 x i8> [[TMP0]]
   21539 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
   21540   return vreinterpretq_p8_s32(a);
   21541 }
   21542 
   21543 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 {
   21544 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   21545 // CHECK:   ret <16 x i8> [[TMP0]]
   21546 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
   21547   return vreinterpretq_p8_s64(a);
   21548 }
   21549 
   21550 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 {
   21551 // CHECK:   ret <16 x i8> %a
   21552 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
   21553   return vreinterpretq_p8_u8(a);
   21554 }
   21555 
   21556 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 {
   21557 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   21558 // CHECK:   ret <16 x i8> [[TMP0]]
   21559 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
   21560   return vreinterpretq_p8_u16(a);
   21561 }
   21562 
   21563 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 {
   21564 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
   21565 // CHECK:   ret <16 x i8> [[TMP0]]
   21566 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
   21567   return vreinterpretq_p8_u32(a);
   21568 }
   21569 
   21570 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 {
   21571 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   21572 // CHECK:   ret <16 x i8> [[TMP0]]
   21573 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
   21574   return vreinterpretq_p8_u64(a);
   21575 }
   21576 
   21577 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 {
   21578 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
   21579 // CHECK:   ret <16 x i8> [[TMP0]]
   21580 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
   21581   return vreinterpretq_p8_f16(a);
   21582 }
   21583 
   21584 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 {
   21585 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
   21586 // CHECK:   ret <16 x i8> [[TMP0]]
   21587 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
   21588   return vreinterpretq_p8_f32(a);
   21589 }
   21590 
   21591 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f64(<2 x double> %a) #0 {
   21592 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
   21593 // CHECK:   ret <16 x i8> [[TMP0]]
   21594 poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) {
   21595   return vreinterpretq_p8_f64(a);
   21596 }
   21597 
   21598 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 {
   21599 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
   21600 // CHECK:   ret <16 x i8> [[TMP0]]
   21601 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
   21602   return vreinterpretq_p8_p16(a);
   21603 }
   21604 
   21605 // CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p64(<2 x i64> %a) #0 {
   21606 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
   21607 // CHECK:   ret <16 x i8> [[TMP0]]
   21608 poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) {
   21609   return vreinterpretq_p8_p64(a);
   21610 }
   21611 
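// poly8 vectors exist mainly to feed the GF(2) polynomial multiplier, and a
// u8 -> p8 reinterpret is the usual way to get integer data into it. Hedged
// sketch using the 64-bit form (not part of the checked test):
static inline poly16x8_t sketch_poly_mul(uint8x8_t x, uint8x8_t y) {
  return vmull_p8(vreinterpret_p8_u8(x), vreinterpret_p8_u8(y)); // carry-less multiply
}
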
   21612 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 {
   21613 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   21614 // CHECK:   ret <8 x i16> [[TMP0]]
   21615 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
   21616   return vreinterpretq_p16_s8(a);
   21617 }
   21618 
   21619 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 {
   21620 // CHECK:   ret <8 x i16> %a
   21621 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
   21622   return vreinterpretq_p16_s16(a);
   21623 }
   21624 
   21625 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 {
   21626 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
   21627 // CHECK:   ret <8 x i16> [[TMP0]]
   21628 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
   21629   return vreinterpretq_p16_s32(a);
   21630 }
   21631 
   21632 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 {
   21633 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   21634 // CHECK:   ret <8 x i16> [[TMP0]]
   21635 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
   21636   return vreinterpretq_p16_s64(a);
   21637 }
   21638 
   21639 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 {
   21640 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   21641 // CHECK:   ret <8 x i16> [[TMP0]]
   21642 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
   21643   return vreinterpretq_p16_u8(a);
   21644 }
   21645 
   21646 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 {
   21647 // CHECK:   ret <8 x i16> %a
   21648 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
   21649   return vreinterpretq_p16_u16(a);
   21650 }
   21651 
   21652 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 {
   21653 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
   21654 // CHECK:   ret <8 x i16> [[TMP0]]
   21655 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
   21656   return vreinterpretq_p16_u32(a);
   21657 }
   21658 
   21659 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 {
   21660 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   21661 // CHECK:   ret <8 x i16> [[TMP0]]
   21662 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
   21663   return vreinterpretq_p16_u64(a);
   21664 }
   21665 
   21666 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 {
   21667 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
   21668 // CHECK:   ret <8 x i16> [[TMP0]]
   21669 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
   21670   return vreinterpretq_p16_f16(a);
   21671 }
   21672 
   21673 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 {
   21674 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
   21675 // CHECK:   ret <8 x i16> [[TMP0]]
   21676 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
   21677   return vreinterpretq_p16_f32(a);
   21678 }
   21679 
   21680 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f64(<2 x double> %a) #0 {
   21681 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
   21682 // CHECK:   ret <8 x i16> [[TMP0]]
   21683 poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) {
   21684   return vreinterpretq_p16_f64(a);
   21685 }
   21686 
   21687 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 {
   21688 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
   21689 // CHECK:   ret <8 x i16> [[TMP0]]
   21690 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
   21691   return vreinterpretq_p16_p8(a);
   21692 }
   21693 
   21694 // CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p64(<2 x i64> %a) #0 {
   21695 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
   21696 // CHECK:   ret <8 x i16> [[TMP0]]
   21697 poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) {
   21698   return vreinterpretq_p16_p64(a);
   21699 }
   21700 
   21701 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s8(<16 x i8> %a) #0 {
   21702 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
   21703 // CHECK:   ret <2 x i64> [[TMP0]]
   21704 poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) {
   21705   return vreinterpretq_p64_s8(a);
   21706 }
   21707 
   21708 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s16(<8 x i16> %a) #0 {
   21709 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
   21710 // CHECK:   ret <2 x i64> [[TMP0]]
   21711 poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) {
   21712   return vreinterpretq_p64_s16(a);
   21713 }
   21714 
   21715 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s32(<4 x i32> %a) #0 {
   21716 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
   21717 // CHECK:   ret <2 x i64> [[TMP0]]
   21718 poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) {
   21719   return vreinterpretq_p64_s32(a);
   21720 }
   21721 
   21722 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s64(<2 x i64> %a) #0 {
   21723 // CHECK:   ret <2 x i64> %a
   21724 poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) {
   21725   return vreinterpretq_p64_s64(a);
   21726 }
   21727 
   21728 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u8(<16 x i8> %a) #0 {
   21729 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
   21730 // CHECK:   ret <2 x i64> [[TMP0]]
   21731 poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) {
   21732   return vreinterpretq_p64_u8(a);
   21733 }
   21734 
   21735 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u16(<8 x i16> %a) #0 {
   21736 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
   21737 // CHECK:   ret <2 x i64> [[TMP0]]
   21738 poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) {
   21739   return vreinterpretq_p64_u16(a);
   21740 }
   21741 
   21742 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u32(<4 x i32> %a) #0 {
   21743 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
   21744 // CHECK:   ret <2 x i64> [[TMP0]]
   21745 poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) {
   21746   return vreinterpretq_p64_u32(a);
   21747 }
   21748 
   21749 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u64(<2 x i64> %a) #0 {
   21750 // CHECK:   ret <2 x i64> %a
   21751 poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) {
   21752   return vreinterpretq_p64_u64(a);
   21753 }
   21754 
   21755 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f16(<8 x half> %a) #0 {
   21756 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
   21757 // CHECK:   ret <2 x i64> [[TMP0]]
   21758 poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) {
   21759   return vreinterpretq_p64_f16(a);
   21760 }
   21761 
   21762 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f32(<4 x float> %a) #0 {
   21763 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
   21764 // CHECK:   ret <2 x i64> [[TMP0]]
   21765 poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) {
   21766   return vreinterpretq_p64_f32(a);
   21767 }
   21768 
   21769 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f64(<2 x double> %a) #0 {
   21770 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
   21771 // CHECK:   ret <2 x i64> [[TMP0]]
   21772 poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) {
   21773   return vreinterpretq_p64_f64(a);
   21774 }
   21775 
   21776 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p8(<16 x i8> %a) #0 {
   21777 // CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
   21778 // CHECK:   ret <2 x i64> [[TMP0]]
   21779 poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) {
   21780   return vreinterpretq_p64_p8(a);
   21781 }
   21782 
   21783 // CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p16(<8 x i16> %a) #0 {
   21784 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
   21785 // CHECK:   ret <2 x i64> [[TMP0]]
   21786 poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) {
   21787   return vreinterpretq_p64_p16(a);
   21788 }
   21789 
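// Scalar floating-point absolute difference: vabds_f32/vabdd_f64 lower to
// the llvm.aarch64.sisd.fabd intrinsic (the FABD instruction) rather than a
// subtract-then-fabs sequence.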
// CHECK-LABEL: define float @test_vabds_f32(float %a, float %b) #0 {
// CHECK:   [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) #4
// CHECK:   ret float [[VABDS_F32_I]]
float32_t test_vabds_f32(float32_t a, float32_t b) {
  return vabds_f32(a, b);
}

// CHECK-LABEL: define double @test_vabdd_f64(double %a, double %b) #0 {
// CHECK:   [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) #4
// CHECK:   ret double [[VABDD_F64_I]]
float64_t test_vabdd_f64(float64_t a, float64_t b) {
  return vabdd_f64(a, b);
}

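// Mixed-sign saturating accumulate: vuqadd_s64 (SUQADD) adds an unsigned
// vector into a signed accumulator, and the vsqadd_* family (USQADD) adds a
// signed vector into an unsigned accumulator, saturating on overflow.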
// CHECK-LABEL: define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> [[VUQADD_I]], <1 x i64> [[VUQADD1_I]]) #4
// CHECK:   ret <1 x i64> [[VUQADD2_I]]
int64x1_t test_vuqadd_s64(int64x1_t a, uint64x1_t b) {
  return vuqadd_s64(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> [[VSQADD_I]], <1 x i64> [[VSQADD1_I]]) #4
// CHECK:   ret <1 x i64> [[VSQADD2_I]]
uint64x1_t test_vsqadd_u64(uint64x1_t a, int64x1_t b) {
  return vsqadd_u64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vsqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VSQADD_I]]
uint8x8_t test_vsqadd_u8(uint8x8_t a, int8x8_t b) {
  return vsqadd_u8(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vsqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VSQADD_I]]
uint8x16_t test_vsqaddq_u8(uint8x16_t a, int8x16_t b) {
  return vsqaddq_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vsqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[VSQADD_I]], <4 x i16> [[VSQADD1_I]]) #4
// CHECK:   ret <4 x i16> [[VSQADD2_I]]
uint16x4_t test_vsqadd_u16(uint16x4_t a, int16x4_t b) {
  return vsqadd_u16(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vsqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> [[VSQADD_I]], <8 x i16> [[VSQADD1_I]]) #4
// CHECK:   ret <8 x i16> [[VSQADD2_I]]
uint16x8_t test_vsqaddq_u16(uint16x8_t a, int16x8_t b) {
  return vsqaddq_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vsqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> [[VSQADD_I]], <2 x i32> [[VSQADD1_I]]) #4
// CHECK:   ret <2 x i32> [[VSQADD2_I]]
uint32x2_t test_vsqadd_u32(uint32x2_t a, int32x2_t b) {
  return vsqadd_u32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vsqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> [[VSQADD_I]], <4 x i32> [[VSQADD1_I]]) #4
// CHECK:   ret <4 x i32> [[VSQADD2_I]]
uint32x4_t test_vsqaddq_u32(uint32x4_t a, int32x4_t b) {
  return vsqaddq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vsqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> [[VSQADD_I]], <2 x i64> [[VSQADD1_I]]) #4
// CHECK:   ret <2 x i64> [[VSQADD2_I]]
uint64x2_t test_vsqaddq_u64(uint64x2_t a, int64x2_t b) {
  return vsqaddq_u64(a, b);
}

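// 64-bit absolute value and negation: vabs/vqabs/vqneg go through NEON
// intrinsics (the q-forms saturate INT64_MIN to INT64_MAX), while vneg is a
// plain integer subtraction from zero.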
// CHECK-LABEL: define <1 x i64> @test_vabs_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> [[VABS_I]]) #4
// CHECK:   ret <1 x i64> [[VABS1_I]]
int64x1_t test_vabs_s64(int64x1_t a) {
  return vabs_s64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vqabs_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> [[VQABS_V_I]]) #4
// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP1]]
int64x1_t test_vqabs_s64(int64x1_t a) {
  return vqabs_s64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vqneg_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> [[VQNEG_V_I]]) #4
// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP1]]
int64x1_t test_vqneg_s64(int64x1_t a) {
  return vqneg_s64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vneg_s64(<1 x i64> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, %a
// CHECK:   ret <1 x i64> [[SUB_I]]
int64x1_t test_vneg_s64(int64x1_t a) {
  return vneg_s64(a);
}

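// Floating-point across-vector reductions: vaddv/vmaxv/vminv reduce all
// lanes to one scalar; the vmaxnmv/vminnmv variants implement IEEE
// maxNum/minNum, returning the numeric operand when one lane is NaN.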
// CHECK-LABEL: define float @test_vaddv_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> [[TMP1]]) #4
// CHECK:   ret float [[VADDV_F32_I]]
float32_t test_vaddv_f32(float32x2_t a) {
  return vaddv_f32(a);
}

// CHECK-LABEL: define float @test_vaddvq_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> [[TMP1]]) #4
// CHECK:   ret float [[VADDVQ_F32_I]]
float32_t test_vaddvq_f32(float32x4_t a) {
  return vaddvq_f32(a);
}

// CHECK-LABEL: define double @test_vaddvq_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> [[TMP1]]) #4
// CHECK:   ret double [[VADDVQ_F64_I]]
float64_t test_vaddvq_f64(float64x2_t a) {
  return vaddvq_f64(a);
}

// CHECK-LABEL: define float @test_vmaxv_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4
// CHECK:   ret float [[VMAXV_F32_I]]
float32_t test_vmaxv_f32(float32x2_t a) {
  return vmaxv_f32(a);
}

// CHECK-LABEL: define double @test_vmaxvq_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4
// CHECK:   ret double [[VMAXVQ_F64_I]]
float64_t test_vmaxvq_f64(float64x2_t a) {
  return vmaxvq_f64(a);
}

// CHECK-LABEL: define float @test_vminv_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4
// CHECK:   ret float [[VMINV_F32_I]]
float32_t test_vminv_f32(float32x2_t a) {
  return vminv_f32(a);
}

// CHECK-LABEL: define double @test_vminvq_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4
// CHECK:   ret double [[VMINVQ_F64_I]]
float64_t test_vminvq_f64(float64x2_t a) {
  return vminvq_f64(a);
}

// CHECK-LABEL: define double @test_vmaxnmvq_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
// CHECK:   ret double [[VMAXNMVQ_F64_I]]
float64_t test_vmaxnmvq_f64(float64x2_t a) {
  return vmaxnmvq_f64(a);
}

// CHECK-LABEL: define float @test_vmaxnmv_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
// CHECK:   ret float [[VMAXNMV_F32_I]]
float32_t test_vmaxnmv_f32(float32x2_t a) {
  return vmaxnmv_f32(a);
}

// CHECK-LABEL: define double @test_vminnmvq_f64(<2 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
// CHECK:   ret double [[VMINNMVQ_F64_I]]
float64_t test_vminnmvq_f64(float64x2_t a) {
  return vminnmvq_f64(a);
}

// CHECK-LABEL: define float @test_vminnmv_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
// CHECK:   ret float [[VMINNMV_F32_I]]
float32_t test_vminnmv_f32(float32x2_t a) {
  return vminnmv_f32(a);
}

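// Pairwise and across-vector addition on 2 x i64: vpaddq adds adjacent lanes
// via llvm.aarch64.neon.addp, while the scalar-result vpaddd/vaddvq forms
// use the saddv/uaddv across-vector intrinsics.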
// CHECK-LABEL: define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4
// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) {
  return vpaddq_s64(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4
// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vpaddq_u64(a, b);
}

// CHECK-LABEL: define i64 @test_vpaddd_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
// CHECK:   ret i64 [[VPADDD_U64_I]]
uint64_t test_vpaddd_u64(uint64x2_t a) {
  return vpaddd_u64(a);
}

// CHECK-LABEL: define i64 @test_vaddvq_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
// CHECK:   ret i64 [[VADDVQ_S64_I]]
int64_t test_vaddvq_s64(int64x2_t a) {
  return vaddvq_s64(a);
}

// CHECK-LABEL: define i64 @test_vaddvq_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
// CHECK:   ret i64 [[VADDVQ_U64_I]]
uint64_t test_vaddvq_u64(uint64x2_t a) {
  return vaddvq_u64(a);
}

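// Arithmetic on one-element double vectors lowers to ordinary scalar IR
// (fadd/fmul/fdiv/fsub); vmla/vmls are emitted as an explicit multiply
// followed by an add or subtract.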
// CHECK-LABEL: define <1 x double> @test_vadd_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = fadd <1 x double> %a, %b
// CHECK:   ret <1 x double> [[ADD_I]]
float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) {
  return vadd_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vmul_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <1 x double> %a, %b
// CHECK:   ret <1 x double> [[MUL_I]]
float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) {
  return vmul_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vdiv_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[DIV_I:%.*]] = fdiv <1 x double> %a, %b
// CHECK:   ret <1 x double> [[DIV_I]]
float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) {
  return vdiv_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vmla_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <1 x double> %b, %c
// CHECK:   [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]]
// CHECK:   ret <1 x double> [[ADD_I]]
float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
  return vmla_f64(a, b, c);
}

// CHECK-LABEL: define <1 x double> @test_vmls_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
// CHECK:   [[MUL_I:%.*]] = fmul <1 x double> %b, %c
// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]]
// CHECK:   ret <1 x double> [[SUB_I]]
float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
  return vmls_f64(a, b, c);
}

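// Fused multiply-add: vfma_f64 maps to llvm.fma with the addend as the last
// operand; vfms_f64 is the same call with the first multiplicand negated by
// an fsub from -0.0 rather than a separate intrinsic.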
// CHECK-LABEL: define <1 x double> @test_vfma_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[TMP6:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP4]], <1 x double> [[TMP5]], <1 x double> [[TMP3]]) #4
// CHECK:   ret <1 x double> [[TMP6]]
float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
  return vfma_f64(a, b, c);
}

// CHECK-LABEL: define <1 x double> @test_vfms_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[TMP6:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP4]], <1 x double> [[TMP5]], <1 x double> [[TMP3]]) #4
// CHECK:   ret <1 x double> [[TMP6]]
float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
  return vfms_f64(a, b, c);
}

// CHECK-LABEL: define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> %a, %b
// CHECK:   ret <1 x double> [[SUB_I]]
float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
  return vsub_f64(a, b);
}

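// float64x1_t absolute difference, min/max, and the minNum/maxNum variants
// all use their v1f64 NEON intrinsics; vabs_f64 uses the generic llvm.fabs
// and vneg_f64 is an fsub from -0.0.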
// CHECK-LABEL: define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> [[VABD_I]], <1 x double> [[VABD1_I]]) #4
// CHECK:   ret <1 x double> [[VABD2_I]]
float64x1_t test_vabd_f64(float64x1_t a, float64x1_t b) {
  return vabd_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> [[VMAX_I]], <1 x double> [[VMAX1_I]]) #4
// CHECK:   ret <1 x double> [[VMAX2_I]]
float64x1_t test_vmax_f64(float64x1_t a, float64x1_t b) {
  return vmax_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> [[VMIN_I]], <1 x double> [[VMIN1_I]]) #4
// CHECK:   ret <1 x double> [[VMIN2_I]]
float64x1_t test_vmin_f64(float64x1_t a, float64x1_t b) {
  return vmin_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> [[VMAXNM_I]], <1 x double> [[VMAXNM1_I]]) #4
// CHECK:   ret <1 x double> [[VMAXNM2_I]]
float64x1_t test_vmaxnm_f64(float64x1_t a, float64x1_t b) {
  return vmaxnm_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> [[VMINNM_I]], <1 x double> [[VMINNM1_I]]) #4
// CHECK:   ret <1 x double> [[VMINNM2_I]]
float64x1_t test_vminnm_f64(float64x1_t a, float64x1_t b) {
  return vminnm_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vabs_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> [[VABS_I]]) #4
// CHECK:   ret <1 x double> [[VABS1_I]]
float64x1_t test_vabs_f64(float64x1_t a) {
  return vabs_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vneg_f64(<1 x double> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %a
// CHECK:   ret <1 x double> [[SUB_I]]
float64x1_t test_vneg_f64(float64x1_t a) {
  return vneg_f64(a);
}

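// Double-to-integer conversions: vcvt uses plain fptosi/fptoui (truncation
// toward zero); vcvtn/vcvtp/vcvtm/vcvta round to nearest (ties to even),
// toward +infinity, toward -infinity, and to nearest (ties away from zero)
// via the fcvtn/fcvtp/fcvtm/fcvta intrinsic families. The reverse direction
// uses sitofp/uitofp.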
// CHECK-LABEL: define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[TMP2:%.*]] = fptosi <1 x double> [[TMP1]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vcvt_s64_f64(float64x1_t a) {
  return vcvt_s64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[TMP2:%.*]] = fptoui <1 x double> [[TMP1]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
  return vcvt_u64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4
// CHECK:   ret <1 x i64> [[VCVTN1_I]]
int64x1_t test_vcvtn_s64_f64(float64x1_t a) {
  return vcvtn_s64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4
// CHECK:   ret <1 x i64> [[VCVTN1_I]]
uint64x1_t test_vcvtn_u64_f64(float64x1_t a) {
  return vcvtn_u64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> [[VCVTP_I]]) #4
// CHECK:   ret <1 x i64> [[VCVTP1_I]]
int64x1_t test_vcvtp_s64_f64(float64x1_t a) {
  return vcvtp_s64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> [[VCVTP_I]]) #4
// CHECK:   ret <1 x i64> [[VCVTP1_I]]
uint64x1_t test_vcvtp_u64_f64(float64x1_t a) {
  return vcvtp_u64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4
// CHECK:   ret <1 x i64> [[VCVTM1_I]]
int64x1_t test_vcvtm_s64_f64(float64x1_t a) {
  return vcvtm_s64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4
// CHECK:   ret <1 x i64> [[VCVTM1_I]]
uint64x1_t test_vcvtm_u64_f64(float64x1_t a) {
  return vcvtm_u64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4
// CHECK:   ret <1 x i64> [[VCVTA1_I]]
int64x1_t test_vcvta_s64_f64(float64x1_t a) {
  return vcvta_s64_f64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4
// CHECK:   ret <1 x i64> [[VCVTA1_I]]
uint64x1_t test_vcvta_u64_f64(float64x1_t a) {
  return vcvta_u64_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VCVT_I:%.*]] = sitofp <1 x i64> [[TMP1]] to <1 x double>
// CHECK:   ret <1 x double> [[VCVT_I]]
float64x1_t test_vcvt_f64_s64(int64x1_t a) {
  return vcvt_f64_s64(a);
}

// CHECK-LABEL: define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VCVT_I:%.*]] = uitofp <1 x i64> [[TMP1]] to <1 x double>
// CHECK:   ret <1 x double> [[VCVT_I]]
float64x1_t test_vcvt_f64_u64(uint64x1_t a) {
  return vcvt_f64_u64(a);
}

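// Fixed-point conversions: the immediate argument of vcvt_n_* is the number
// of fractional bits, here 64, the maximum for a 64-bit element.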
// CHECK-LABEL: define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
// CHECK:   ret <1 x i64> [[VCVT_N1]]
int64x1_t test_vcvt_n_s64_f64(float64x1_t a) {
  return vcvt_n_s64_f64(a, 64);
}

// CHECK-LABEL: define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
// CHECK:   ret <1 x i64> [[VCVT_N1]]
uint64x1_t test_vcvt_n_u64_f64(float64x1_t a) {
  return vcvt_n_u64_f64(a, 64);
}

// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
// CHECK:   ret <1 x double> [[VCVT_N1]]
float64x1_t test_vcvt_n_f64_s64(int64x1_t a) {
  return vcvt_n_f64_s64(a, 64);
}

// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
// CHECK:   ret <1 x double> [[VCVT_N1]]
float64x1_t test_vcvt_n_f64_u64(uint64x1_t a) {
  return vcvt_n_f64_u64(a, 64);
}

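// Rounding: vrndn keeps the target-specific frintn (round to nearest, ties
// to even); the others map to generic intrinsics: vrnda -> llvm.round,
// vrndp -> llvm.ceil, vrndm -> llvm.floor, vrndx -> llvm.rint,
// vrnd -> llvm.trunc, vrndi -> llvm.nearbyint.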
// CHECK-LABEL: define <1 x double> @test_vrndn_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRNDN1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> [[VRNDN_I]]) #4
// CHECK:   ret <1 x double> [[VRNDN1_I]]
float64x1_t test_vrndn_f64(float64x1_t a) {
  return vrndn_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vrnda_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[VRNDA_I]]) #4
// CHECK:   ret <1 x double> [[VRNDA1_I]]
float64x1_t test_vrnda_f64(float64x1_t a) {
  return vrnda_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vrndp_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> [[VRNDP_I]]) #4
// CHECK:   ret <1 x double> [[VRNDP1_I]]
float64x1_t test_vrndp_f64(float64x1_t a) {
  return vrndp_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vrndm_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[VRNDM_I]]) #4
// CHECK:   ret <1 x double> [[VRNDM1_I]]
float64x1_t test_vrndm_f64(float64x1_t a) {
  return vrndm_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vrndx_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[VRNDX_I]]) #4
// CHECK:   ret <1 x double> [[VRNDX1_I]]
float64x1_t test_vrndx_f64(float64x1_t a) {
  return vrndx_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vrnd_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[VRNDZ_I]]) #4
// CHECK:   ret <1 x double> [[VRNDZ1_I]]
float64x1_t test_vrnd_f64(float64x1_t a) {
  return vrnd_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vrndi_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[VRNDI_I]]) #4
// CHECK:   ret <1 x double> [[VRNDI1_I]]
float64x1_t test_vrndi_f64(float64x1_t a) {
  return vrndi_f64(a);
}

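// Reciprocal and reciprocal-square-root support: vrecpe/vrsqrte return the
// hardware estimate, vrecps/vrsqrts compute the Newton-Raphson step terms
// (2 - a*b and (3 - a*b)/2) used to refine it, and vsqrt_f64 lowers to the
// generic llvm.sqrt.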
// CHECK-LABEL: define <1 x double> @test_vrsqrte_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> [[VRSQRTE_V_I]]) #4
// CHECK:   ret <1 x double> [[VRSQRTE_V1_I]]
float64x1_t test_vrsqrte_f64(float64x1_t a) {
  return vrsqrte_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vrecpe_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> [[VRECPE_V_I]]) #4
// CHECK:   ret <1 x double> [[VRECPE_V1_I]]
float64x1_t test_vrecpe_f64(float64x1_t a) {
  return vrecpe_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vsqrt_f64(<1 x double> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[TMP1]]) #4
// CHECK:   ret <1 x double> [[VSQRT_I]]
float64x1_t test_vsqrt_f64(float64x1_t a) {
  return vsqrt_f64(a);
}

// CHECK-LABEL: define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> [[VRECPS_V_I]], <1 x double> [[VRECPS_V1_I]]) #4
// CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <1 x double> [[VRECPS_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <1 x double>
// CHECK:   ret <1 x double> [[TMP2]]
float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t b) {
  return vrecps_f64(a, b);
}

// CHECK-LABEL: define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> [[VRSQRTS_V_I]], <1 x double> [[VRSQRTS_V1_I]]) #4
// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <1 x double>
// CHECK:   ret <1 x double> [[TMP2]]
float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) {
  return vrsqrts_f64(a, b);
}

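// Integer across-vector reductions on 2 x i32: min/max/add reduce to an i32
// result, and the widening vaddlv forms accumulate into an i64.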
// CHECK-LABEL: define i32 @test_vminv_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[TMP1]]) #4
// CHECK:   ret i32 [[VMINV_S32_I]]
int32_t test_vminv_s32(int32x2_t a) {
  return vminv_s32(a);
}

// CHECK-LABEL: define i32 @test_vminv_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[TMP1]]) #4
// CHECK:   ret i32 [[VMINV_U32_I]]
uint32_t test_vminv_u32(uint32x2_t a) {
  return vminv_u32(a);
}

// CHECK-LABEL: define i32 @test_vmaxv_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4
// CHECK:   ret i32 [[VMAXV_S32_I]]
int32_t test_vmaxv_s32(int32x2_t a) {
  return vmaxv_s32(a);
}

// CHECK-LABEL: define i32 @test_vmaxv_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4
// CHECK:   ret i32 [[VMAXV_U32_I]]
uint32_t test_vmaxv_u32(uint32x2_t a) {
  return vmaxv_u32(a);
}

// CHECK-LABEL: define i32 @test_vaddv_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[TMP1]]) #4
// CHECK:   ret i32 [[VADDV_S32_I]]
int32_t test_vaddv_s32(int32x2_t a) {
  return vaddv_s32(a);
}

// CHECK-LABEL: define i32 @test_vaddv_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[TMP1]]) #4
// CHECK:   ret i32 [[VADDV_U32_I]]
uint32_t test_vaddv_u32(uint32x2_t a) {
  return vaddv_u32(a);
}

// CHECK-LABEL: define i64 @test_vaddlv_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4
// CHECK:   ret i64 [[VADDLV_S32_I]]
int64_t test_vaddlv_s32(int32x2_t a) {
  return vaddlv_s32(a);
}

// CHECK-LABEL: define i64 @test_vaddlv_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4
// CHECK:   ret i64 [[VADDLV_U32_I]]
uint64_t test_vaddlv_u32(uint32x2_t a) {
  return vaddlv_u32(a);
}
